#!/usr/bin/env python3
"""
Pass 4: CORRECT Scrapling extraction using page.body (not page.text).
Uses SearXNG to find ideal-investisseur URLs, then scrapes and fills all gaps.
"""
import json, re, subprocess, time, sys
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher
from urllib.request import urlopen, Request
import urllib.parse

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"
SEARXNG = "http://192.168.1.127:8081/search"

def read_sheet():
    """Fetch 'Listing sites'!A1:Y111 from the spreadsheet via the `gws` CLI.

    Returns:
        The sheet rows as a list of lists of strings (possibly ragged —
        the API omits trailing empty cells).

    Raises:
        RuntimeError: when the CLI cannot be launched, exits non-zero, or
            prints output that is not valid JSON (previously these surfaced
            as an unhandled FileNotFoundError or a cryptic JSONDecodeError
            on empty stdout).
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True,
                           cwd='/home/shingokuga/.openclaw/workspace')
    except OSError as e:  # gws binary or cwd missing
        raise RuntimeError(f"failed to run gws: {e}") from e
    if r.returncode != 0:
        raise RuntimeError(f"gws exited with {r.returncode}: {r.stderr.strip()}")
    try:
        return json.loads(r.stdout).get('values', [])
    except json.JSONDecodeError as e:
        raise RuntimeError(f"gws returned non-JSON output: {r.stdout[:200]!r}") from e

def gws_update(range_str, value):
    """Write a single value to `Listing sites!{range_str}` via the `gws` CLI.

    Returns True when the CLI output indicates cells were updated.
    """
    target = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": target,
              "valueInputOption": "USER_ENTERED"}
    payload = {"range": target, "values": [[value]]}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps(params),
           '--json', json.dumps(payload)]
    result = subprocess.run(cmd, capture_output=True, text=True,
                            cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in result.stdout

def html_to_text(html_bytes):
    """Convert an HTML document (bytes or str) to plain text.

    Strips <script>/<style> blocks, turns <br> and common block-level tags
    into newlines, drops all remaining tags, decodes HTML entities, and
    collapses runs of horizontal whitespace and blank lines.
    """
    from html import unescape  # stdlib decoder for ALL named/numeric entities
    markup = html_bytes.decode('utf-8', errors='ignore') if isinstance(html_bytes, bytes) else html_bytes
    markup = re.sub(r'<script[^>]*>.*?</script>', '', markup, flags=re.S | re.I)
    markup = re.sub(r'<style[^>]*>.*?</style>', '', markup, flags=re.S | re.I)
    markup = re.sub(r'<br\s*/?>', '\n', markup, flags=re.I)
    markup = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', markup, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', markup)
    # unescape() handles every entity (the previous hand-rolled table missed
    # most, e.g. &eacute;). NBSP is normalized to a plain space so the
    # `[ \t]+` collapse below still applies to it.
    text = unescape(text).replace('\xa0', ' ')
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text

def searxng_find(scpi_name, site_filter):
    """Search SearXNG for a SCPI page restricted to the given site.

    Returns the first result URL that contains `site_filter` and is not a
    PDF, or None when the search fails or nothing matches.
    """
    query = urllib.parse.quote(f"site:{site_filter} scpi {scpi_name}")
    url = f"{SEARXNG}?q={query}&format=json"
    try:
        # Context manager closes the HTTP response (the old code leaked it).
        with urlopen(Request(url), timeout=8) as resp:
            data = json.loads(resp.read().decode('utf-8'))
    except (OSError, ValueError):
        # OSError covers URLError/timeouts; ValueError covers bad JSON.
        # Narrowed from a bare `except:` that also swallowed KeyboardInterrupt.
        return None
    for result in data.get('results', []):
        candidate = result.get('url', '')  # .get: a result missing 'url' no longer raises
        if site_filter in candidate and '.pdf' not in candidate:
            return candidate
    return None

def extract_ii(text):
    """Extract SCPI key figures from ideal-investisseur plain text.

    Args:
        text: plain text produced by html_to_text() from a scpi-avis page.

    Returns:
        Dict mapping sheet column letters (D..Y) to formatted string values;
        a key is present only when its pattern matched.
    """
    d = {}

    # G: Label ISR — explicit Oui/Non; bare mention of the label implies Oui
    m = re.search(r'Label\s+ISR\s*(Non|Oui)', text, re.I)
    if m: d['G'] = 'Oui' if m.group(1).lower() == 'oui' else 'Non'
    elif re.search(r'Label\s+ISR', text): d['G'] = 'Oui'

    # D: capital type — "Capital fixe" / "Capital variable"
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()

    # K: debt ratio — "Taux d'endettement 8,72 %"
    m = re.search(r"Taux\s+d'endettement\s+(\d+[,.]?\d*)\s*%", text, re.I)
    if m: d['K'] = m.group(1) + '%'

    # S: occupancy rate — "TOF 100,00 %"
    m = re.search(r'TOF\s+(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['S'] = m.group(1) + '%'

    # V: subscription price — "Prix de souscription 50,00 €"
    m = re.search(r'Prix\s+de\s+souscription\s+(\d[\d\s,.]*€)', text, re.I)
    if m: d['V'] = m.group(1).strip()

    # V fallback — "Valeur de Souscription : ..."
    if 'V' not in d:
        m = re.search(r'(?:Valeur|Prix)\s+de\s+souscription\s*:?\s*(\d[\d\s,.]*€)', text, re.I)
        if m: d['V'] = m.group(1).strip()

    # W: reconstitution value — "Valeur de reconstitution 51,60 €"
    m = re.search(r'Valeur\s+de\s+reconstitution\s+(\d[\d\s,.]*€)', text, re.I)
    if m: d['W'] = m.group(1).strip()

    # L: share count (thousands separators stripped)
    m = re.search(r'(?:Nombre\s+de\s+parts|Parts?\s+émises?)\s*:?\s*(\d[\d\s]*)', text, re.I)
    if m: d['L'] = m.group(1).strip().replace(' ', '')

    # M: shares pending withdrawal
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*(?:attente\s*de\s*)?retrait', text, re.I)
    if m: d['M'] = m.group(1).strip().replace(' ', '')

    # Q: entitlement delay — "Délai de jouissance : N mois"
    m = re.search(r'(?:Délai|Durée)\s+de\s+jouissance\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()

    # X: subscription fee — "Commission de souscription 10,00 %"
    m = re.search(r'(?:Commission|Frais)\s+de\s+souscription\s+(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['X'] = m.group(1) + '% TTC'

    # Y: management fee — "Commission de gestion 12,00 %"
    m = re.search(r'(?:Commission|Frais)\s+de\s+gestion\s+(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['Y'] = m.group(1) + '% TTC'

    # T: distribution rate — "Taux de distribution 2025 : 8,00 %"
    m = re.search(r'Taux\s+de\s+distribution\s*(?:\d{4})?\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['T'] = m.group(1) + '%'

    # U: 10-year IRR — the sign is captured and kept (the previous pattern
    # matched but DISCARDED the minus sign, reporting a negative TRI as
    # positive). Unicode minus/en-dash variants are normalized to '-'.
    m = re.search(r'TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*([−–-]?)\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m:
        sign = '-' if m.group(1) else ''
        d['U'] = sign + m.group(2) + '%'

    # H: minimum subscription
    m = re.search(r'(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['H'] = m.group(1).strip()

    # P: tax regime (most specific pattern first)
    if re.search(r'impôt.*?sociétés.*?allemand|IS\s*15,825', text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r'source étrangère|convention[^.]*fiscale', text, re.I):
        d['P'] = 'Revenus source étrangère'
    elif re.search(r'revenus?\s+fonciers|PFU|prélèvement\s+forfaitaire', text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'

    # N: retained earnings — "Report à nouveau X jours" preferred over "X €"
    m = re.search(r'Report\s+à\s+nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)', text, re.I)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'Report\s+à\s+nouveau\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
        if m: d['N'] = m.group(1).strip()

    # O: major-maintenance provision
    m = re.search(r'(?:Provision|Réserve)\s+pour\s+gros\s+entretien\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
    if m: d['O'] = m.group(1).strip()

    # R: distribution frequency
    if re.search(r'trimestriel', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'semestriel', text, re.I): d['R'] = 'Semestrielle'

    # E: geographic split — "Répartition géographique" followed by
    # "<Place> NN %" lines; keep at most 6 entries
    m = re.search(r'[Rr]épartition\s*géographique\s*\n((?:[^\n]+\n){2,15})', text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])

    # F: sector split — same layout under "Typologie des actifs" /
    # "Répartition sectorielle"
    m = re.search(r'(?:Typologie|Répartition)\s*(?:des actifs|sectorielle)\s*\n((?:[^\n]+\n){2,15})', text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])

    # I: creation date — "Création : <mois> <année>"
    m = re.search(r'Création\s*:?\s*(\w+\s+\d{4})', text, re.I)
    if m: d['I'] = m.group(1).strip()

    # J: capitalisation — "Capitalisation 42,17 M€"
    m = re.search(r'Capitalisation\s+(\d+[,.]?\d*)\s*(M€|Mds?€|milliards?\s*€?)', text, re.I)
    if m: d['J'] = m.group(1) + ' ' + m.group(2)

    return d

# Read sheet once; rows[0] is the header row.
rows = read_sheet()

# Build list of rows needing fixes.
# needs: 1-based sheet row number -> (SCPI name, {column letter: 0-based index})
# for every row with a non-empty name cell (column B) and at least one
# empty/placeholder cell in columns D..Y.
needs = {}
for i in range(1, len(rows)):  # skip header at index 0
    row = rows[i]
    r = i + 1  # 0-based list index -> 1-based sheet row
    scpi = row[1].strip() if len(row) > 1 else ''  # column B: SCPI name
    if not scpi:
        continue
    empty = {}
    for j in range(3, 25):  # data columns D (3) through Y (24)
        # Short rows: the API omits trailing empty cells, so treat
        # out-of-range indices as empty.
        val = row[j].strip() if j < len(row) else ''
        if val in ('', 'N/A', 'Non trouvé', '-'):
            col = chr(65+j)  # 0-based index -> column letter ('A' == chr(65))
            empty[col] = j
    if empty:
        needs[r] = (scpi, empty)

print(f"Rows needing fixes: {len(needs)}")

fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0
skipped = []  # SCPI names we could not fill, with a reason suffix where known

for r, (scpi, empty) in sorted(needs.items()):
    text = None

    # Try ideal-investisseur via SearXNG
    ii_url = searxng_find(scpi, "ideal-investisseur.fr/scpi-avis")
    if ii_url:
        try:
            # NOTE(review): Googlebot UA presumably makes the site serve
            # full content to crawlers — confirm before changing.
            page = fetcher.get(ii_url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
            # Require > 2000 bytes to skip stub/error pages returned with 200.
            if page.status == 200 and page.body and len(page.body) > 2000:
                text = html_to_text(page.body)
        except:
            # Best-effort: any fetch failure falls through to the skip below.
            pass

    if not text:
        skipped.append(scpi)
        continue

    data = extract_ii(text)
    if not data:
        skipped.append(f"{scpi} (no data extracted)")
        continue

    # Only write cells that are both empty in the sheet AND extracted.
    updates = [(col, data[col]) for col in empty if col in data]
    if not updates:
        skipped.append(f"{scpi} (no matching cols)")
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # throttle per-cell Sheets API writes
        else:
            print(f"{col}❌ ", end="", flush=True)

    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)  # pause between rows (rate limiting)

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")
print(f"\nSkipped ({len(skipped)}): {', '.join(skipped[:20])}")