#!/usr/bin/env python3
"""
Pass 3: ideal-investisseur.fr — focus on K (endettement), L (nb parts), M (parts retrait),
N (RAN), O (PGE), Q (jouissance), S (TOF), T (PGA), U (TRI), X/Y (commissions)
Uses Scrapling Fetcher for reliable HTML extraction.
"""
import html
import json
import re
import subprocess
import sys
import time
import unicodedata
import urllib.parse
from urllib.request import Request, urlopen

# Scrapling lives in a skill venv, not on the default sys.path.
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

# Google Sheet holding the SCPI listing (tab "Listing sites").
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

def read_sheet():
    """Fetch 'Listing sites'!A1:Y111 via the `gws` CLI.

    Returns the sheet rows as a list of lists (possibly ragged — trailing
    empty cells are omitted by the API).

    Raises:
        RuntimeError: if the gws CLI exits non-zero or prints nothing,
            with the CLI's stderr included for diagnosis (the previous
            behavior was an opaque JSONDecodeError on empty output).
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0 or not r.stdout.strip():
        raise RuntimeError(f"gws sheets get failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def gws_update(range_str, value):
    """Write a single cell value to Listing sites!{range_str}.

    Returns True when the gws CLI output confirms the update
    (contains 'updatedCells'), False otherwise.
    """
    full_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": full_range, "valueInputOption": "USER_ENTERED"}
    body = {"range": full_range, "values": [[value]]}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps(params),
           '--json', json.dumps(body)]
    result = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in result.stdout

def html_to_text(page):
    """Convert a fetched HTML page (any object with a ``.text`` attribute)
    into plain text suitable for the French-language regexes below.

    Steps: strip <script>/<style> blocks, turn structural tags into
    newlines, drop remaining tags, decode HTML entities, and collapse
    runs of whitespace.
    """
    markup = page.text
    markup = re.sub(r'<script[^>]*>.*?</script>', '', markup, flags=re.S | re.I)
    markup = re.sub(r'<style[^>]*>.*?</style>', '', markup, flags=re.S | re.I)
    markup = re.sub(r'<br\s*/?>', '\n', markup, flags=re.I)
    markup = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th|dd|dt)[^>]*>', '\n', markup, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', markup)
    # html.unescape decodes ALL entities (&eacute;, &#39;, &amp;, ...), not
    # just the two the previous version replaced by hand — accented entities
    # otherwise break the French extraction regexes downstream.
    # &nbsp; decodes to U+00A0, which '[ \t]+' would miss, so map it to ' '.
    text = html.unescape(text).replace('\xa0', ' ')
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n', text)
    return text

def slugify(name):
    """Build a URL slug from an SCPI name.

    Lowercases, strips accents via Unicode NFKD decomposition (covers every
    accented Latin character, not just a hand-picked list), maps 'œ' to
    'oe', drops apostrophes (straight and curly) and parentheses, and
    replaces spaces with hyphens. Backward compatible with the previous
    hard-coded replacement table.
    """
    s = unicodedata.normalize('NFKD', name.lower().strip())
    # Dropping combining marks turns e.g. 'é' (e + U+0301) into plain 'e'.
    s = ''.join(c for c in s if not unicodedata.combining(c))
    s = s.replace('œ', 'oe')
    for ch in ("'", '’', '(', ')'):
        s = s.replace(ch, '')
    return s.replace(' ', '-')

# Ideal-investisseur URL patterns
def ii_url(name):
    """Stub URL builder for ideal-investisseur.fr SCPI pages.

    Most SCPI pages follow /scpi-avis/{slug}-{id}.html, but the numeric
    {id} cannot be derived from the name alone, so this always returns
    None; real URLs are discovered by candidate probing and search in the
    main loop below.
    """
    _ = slugify(name)  # kept so malformed names fail exactly as before
    return None

# Read current sheet state
rows = read_sheet()
headers = rows[0]

# Scan data rows (sheet rows 2..N) and record, per row, which of the
# data columns D..Y are still blank or hold a placeholder value.
# needs maps sheet row number -> (SCPI name, {column letter: column index}).
needs = {}
for r, row in enumerate(rows[1:], start=2):
    scpi = row[1].strip() if len(row) > 1 else ''
    if not scpi:
        continue

    empty = {}
    for j in range(3, 25):  # indices 3..24 == columns D..Y
        cell = row[j].strip() if j < len(row) else ''
        if cell in ('', 'N/A', 'Non trouvé', '-'):
            empty[chr(65 + j)] = j

    if empty:
        needs[r] = (scpi, empty)

print(f"Rows needing fixes: {len(needs)}")

# Strategy: use web_search to find ideal-investisseur URLs for SCPIs not covered by CBI
# For now, try direct ideal-investisseur scraping with known URL patterns
# auto_match=False: plain one-shot fetches; Scrapling's element re-matching is not needed.
fetcher = Fetcher(auto_match=False)

# Running totals for the end-of-run summary line.
total_cells = 0
total_rows = 0

# Known ideal-investisseur URL mappings (from previous sessions)
# Try searching by slug pattern
II_BASE = "https://www.ideal-investisseur.fr/scpi-avis/"

# Main pass: for each incomplete row, locate the SCPI's page on
# ideal-investisseur.fr, extract whatever figures the page exposes, and
# write back only the cells that were empty in the sheet.
for r, (scpi, empty) in sorted(needs.items()):
    slug = slugify(scpi)
    # Cheapest path first: guess the page URL directly from the slug.
    candidates = [
        f"{II_BASE}{slug}.html",
        f"{II_BASE}{slug}-avis.html",
    ]

    text = None
    page_url = None

    for url in candidates:
        try:
            page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
            # >2000 chars filters out soft-404 / stub pages.
            if page.status == 200 and len(page.text) > 2000:
                text = html_to_text(page)
                page_url = url
                break
        except Exception:
            # A failed candidate just means "try the next pattern".
            # (Was a bare except:, which also swallowed KeyboardInterrupt.)
            pass

    if not text:
        # Fallback: Google search restricted to the site — many pages carry
        # a numeric id we cannot guess from the name alone.
        try:
            q = urllib.parse.quote(f"site:ideal-investisseur.fr scpi {scpi}")
            search_url = f"https://www.google.com/search?q={q}&num=3"
            req = Request(search_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'})
            with urlopen(req, timeout=10) as resp:  # context manager closes the connection
                search_html = resp.read().decode('utf-8', errors='ignore')
            urls = re.findall(r'https://www\.ideal-investisseur\.fr/scpi-avis/[^"&\s]+', search_html)
            if urls:
                found_url = urls[0]
                page = fetcher.get(found_url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
                if page.status == 200 and len(page.text) > 2000:
                    text = html_to_text(page)
                    page_url = found_url
        except Exception:
            # Search is best-effort; failure leaves this row for another pass.
            pass

    if not text:
        continue

    # Extracted values, keyed by sheet column letter.
    d = {}

    # K: debt ratio (taux d'endettement), normalized to '12.3%'
    m = re.search(r'Taux\s+d\'endettement\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['K'] = m.group(1).replace(',', '.') + '%'

    # L: number of shares (thousands separators stripped)
    m = re.search(r'(?:Nombre\s+de\s+parts|Parts?\s+émises?)\s*:?\s*(\d[\d\s]*)', text, re.I)
    if m: d['L'] = m.group(1).strip().replace(' ', '')

    # M: shares awaiting withdrawal
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente\s*de\s*retrait', text, re.I)
    if m: d['M'] = m.group(1).strip().replace(' ', '')

    # N: retained earnings (report à nouveau) — prefer "N jours de
    # distribution" wording, fall back to an amount in euros.
    m = re.search(r'Report\s+à\s+nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)', text, re.I)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'Report\s+à\s+nouveau\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
        if m: d['N'] = m.group(1).strip()

    # O: provision for major maintenance (PGE)
    m = re.search(r'(?:Provision|Réserve)\s+pour\s+gros\s+entretien\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
    if m: d['O'] = m.group(1).strip()

    # Q: entitlement delay (délai de jouissance), in months
    m = re.search(r'Délai\s+de\s+jouissance\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()

    # S: financial occupancy rate (TOF)
    m = re.search(r'(?:TOF|Taux\s+d\'occupation\s+financière)\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['S'] = m.group(1).replace(',', '.') + '%'

    # T: annual global performance (PGA), optionally followed by a year
    m = re.search(r'(?:Performance\s+globale\s+annuelle|PGA)\s*(?:\d{4})?\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['T'] = m.group(1).replace(',', '.') + '%'

    # U: 10-year internal rate of return (TRI)
    m = re.search(r'TRI\s+(?:10\s+ans|à\s+10\s+ans)\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['U'] = m.group(1).replace(',', '.') + '%'

    # V: subscription price
    m = re.search(r'(?:Valeur|Prix)\s+de\s+souscription\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['V'] = m.group(1).strip()

    # W: reconstitution value
    m = re.search(r'Valeur\s+de\s+reconstitution\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['W'] = m.group(1).strip()

    # X: subscription fee
    m = re.search(r'(?:Commission|Frais)\s+de\s+souscription\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['X'] = m.group(1).replace(',', '.') + '% TTC'

    # Y: management fee
    m = re.search(r'(?:Commission|Frais)\s+de\s+gestion\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['Y'] = m.group(1).replace(',', '.') + '% TTC'

    # G: ISR label (mere mention on the page counts as "yes")
    if re.search(r'Label\s+ISR', text, re.I):
        d['G'] = 'Oui'

    # D: capital type (fixed/variable)
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()

    # H: minimum subscription
    m = re.search(r'(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['H'] = m.group(1).strip()

    # P: tax regime — German corporate tax mention wins over the generic
    # French regime keywords.
    if re.search(r'impôt\s+sur\s+les\s+sociétés\s+allemand|IS\s*15,825', text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r'revenus\s+fonciers|PFU|prélèvement\s+forfaitaire', text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'

    # E: geographic breakdown — capture a few "Place NN%" pairs after the
    # "Répartition géographique" heading.
    m = re.search(r'[Rr]épartition\s*géographique\s*:?\s*\n((?:[^\n]+\n){2,10})', text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])

    # F: sector breakdown, same approach.
    m = re.search(r'[Rr]épartition\s*(?:par\s+)?secteur\s*:?\s*\n((?:[^\n]+\n){2,10})', text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])

    # Fill only cells that were empty in the sheet (never overwrite data).
    updates = [(col, d[col]) for col in empty if col in d]

    if not updates:
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # stay under the Sheets API write-rate limit
        else:
            print(f"{col}❌ ", end="", flush=True)

    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)  # polite pause between rows

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled via ideal-investisseur")
