#!/usr/bin/env python3
"""
Mass fix ALL gaps using verified commentbieninvestir URLs.
Also uses ideal-investisseur for the ones CBI doesn't cover.
"""
import json, re, subprocess, time
from urllib.request import urlopen, Request

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

# Manually verified commentbieninvestir.fr fact-sheet URLs, keyed by the SCPI
# name exactly as it appears in column B of the "Listing sites" sheet tab.
# Rows whose column-B value is not a key here are skipped by the main loop.
CBI_URLS = {
    "Activimmo": "https://commentbieninvestir.fr/scpi-activimmo/",
    "Aestiam Horizon": "https://commentbieninvestir.fr/scpi-aestiam-horizon/",
    "Alta Convictions": "https://commentbieninvestir.fr/scpi-alta-convictions/",
    "Altixia Cadence XII": "https://commentbieninvestir.fr/altixia-cadence-xii/",
    "Altixia Commerces": "https://commentbieninvestir.fr/scpi-altixia-commerces/",
    "Cap Foncières et Territoires": "https://commentbieninvestir.fr/scpi-cap-foncieres-et-territoires/",
    "Coeur d'Avenir": "https://commentbieninvestir.fr/scpi-coeur-davenir/",
    "Coeur d'Europe": "https://commentbieninvestir.fr/scpi-coeur-deurope/",
    "Coeur de régions": "https://commentbieninvestir.fr/scpi-coeur-de-regions/",
    "Comète": "https://commentbieninvestir.fr/scpi-comete/",
    "Corum Eurion": "https://commentbieninvestir.fr/scpi-corum-eurion/",
    "Corum Origin": "https://commentbieninvestir.fr/scpi-corum-origin/",
    "Corum USA": "https://commentbieninvestir.fr/scpi-corum-usa/",
    "Corum XL": "https://commentbieninvestir.fr/scpi-corum-xl/",
    "Cristal Life": "https://commentbieninvestir.fr/scpi-cristal-life/",
    "Cristal Rente": "https://commentbieninvestir.fr/scpi-cristal-rente/",
    "Darwin RE01": "https://commentbieninvestir.fr/scpi-darwin-re01/",
    "Eden": "https://commentbieninvestir.fr/scpi-eden/",
    "Elevation Tertiom": "https://commentbieninvestir.fr/scpi-elevation-tertiom/",
    "Elialys": "https://commentbieninvestir.fr/scpi-elialys/",
    "Epargne Foncière": "https://commentbieninvestir.fr/epargne-fonciere/",
    "Epargne Pierre": "https://commentbieninvestir.fr/scpi-epargne-pierre/",
    "Epargne Pierre Europe": "https://commentbieninvestir.fr/epargne-pierre-europe/",
    "Epargne Pierre Sophia": "https://commentbieninvestir.fr/scpi-epargne-pierre-sophia/",
    "Epsicap Explore": "https://commentbieninvestir.fr/scpi-epsicap-explore/",
    "Epsicap Nano": "https://commentbieninvestir.fr/scpi-epsicap-nano/",
    "Esprit Horizon": "https://commentbieninvestir.fr/scpi-esprit-horizon/",
    "Eurovalys": "https://commentbieninvestir.fr/scpi-eurovalys/",
    "Euryale Horizons Santé": "https://commentbieninvestir.fr/scpi-euryale-horizons-sante/",
    "Foncière Des Praticiens": "https://commentbieninvestir.fr/scpi-fonciere-des-praticiens/",
    "GMA Essentialis": "https://commentbieninvestir.fr/scpi-gma-essentialis/",
    "Iroko Atlas": "https://commentbieninvestir.fr/scpi-iroko-atlas/",
    "Iroko Zen": "https://commentbieninvestir.fr/scpi-iroko-zen/",
    "LF Avenir Santé": "https://commentbieninvestir.fr/lf-avenir-sante/",
    "LOG IN": "https://commentbieninvestir.fr/scpi-log-in/",
    "Linaclub": "https://commentbieninvestir.fr/scpi-linaclub/",
    "Mistral Sélection": "https://commentbieninvestir.fr/scpi-mistral-selection/",
    "MomenTime": "https://commentbieninvestir.fr/scpi-momentime/",
    "NCap Continent": "https://commentbieninvestir.fr/scpi-ncap-continent/",
    "NCap Education Santé": "https://commentbieninvestir.fr/scpi-ncap-education-sante/",
    "NCap Régions": "https://commentbieninvestir.fr/scpi-ncap-regions/",
    "Optimale": "https://commentbieninvestir.fr/scpi-optimale/",
    "Osmo Energie": "https://commentbieninvestir.fr/scpi-osmo-energie/",
    "Paref Evo": "https://commentbieninvestir.fr/scpi-paref-evo/",
    "Paref Hexa": "https://commentbieninvestir.fr/scpi-paref-hexa/",
    "Paref Prima": "https://commentbieninvestir.fr/scpi-paref-prima/",
    "Perial Grand Paris": "https://commentbieninvestir.fr/scpi-perial-grand-paris/",
    "Perial Hospitalité Europe": "https://commentbieninvestir.fr/scpi-perial-hospitalite-europe/",
    "Perial O2": "https://commentbieninvestir.fr/scpi-perial-o2/",
    "Perial Opportunités Europe": "https://commentbieninvestir.fr/scpi-perial-opportunites-europe/",
    "Pierval Santé": "https://commentbieninvestir.fr/scpi-pierval-sante/",
    "Primopierre": "https://commentbieninvestir.fr/scpi-primopierre/",
    "Primovie": "https://commentbieninvestir.fr/scpi-primovie/",
    "Reason": "https://commentbieninvestir.fr/scpi-reason/",
    "Remake Live": "https://commentbieninvestir.fr/scpi-remake-live/",
    "Sofidynamic": "https://commentbieninvestir.fr/scpi-sofidynamic/",
    "Transitions Europe": "https://commentbieninvestir.fr/scpi-transitions-europe/",
    "Upeka": "https://commentbieninvestir.fr/scpi-upeka/",
    "Wemo One": "https://commentbieninvestir.fr/scpi-wemo-one/",
}

def html_to_text(url):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        resp = urlopen(req, timeout=15)
        html = resp.read().decode('utf-8', errors='ignore')
        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
        html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
        html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th|dd|dt)[^>]*>', '\n', html, flags=re.I)
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'&nbsp;', ' ', text)
        text = re.sub(r'&amp;', '&', text)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n', text)
        return text
    except:
        return None

def extract(text: str) -> dict:
    """Scrape SCPI key figures from a fact-sheet page's plain text.

    *text* is the newline-structured output of html_to_text(); the patterns
    below rely on labels and values landing on adjacent lines.  Returns a
    dict keyed by spreadsheet column letter (D..Y) containing only the
    fields whose pattern actually matched — missing keys mean "not found".
    """
    d = {}
    
    # G: ISR label — a positive mention wins; only then check for explicit negations.
    if re.search(r'[Ll]abel ISR', text): d['G'] = 'Oui'
    elif re.search(r'Non labellisée|pas de label', text, re.I): d['G'] = 'Non'
    
    # D: capital type ("fixe" or "variable"), normalized to "Fixe"/"Variable".
    m = re.search(r'[Cc]apital\s*(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()
    
    # E: geographic breakdown — grab 2-12 lines after the "Répartition Géographique"
    # heading, then pull up to 6 "Place NN%" pairs out of them.
    m = re.search(r'[Rr][ée]partition\s*[Gg][ée]ographique[^"]*?\n((?:[^\n]+\n){2,12})', text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])
    
    # F: sector breakdown — same line-window approach under "Répartition Sectorielle".
    m = re.search(r'[Rr][ée]partition\s*[Ss]ectorielle[^"]*?\n((?:[^\n]+\n){2,12})', text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])
    
    # H: minimum subscription — euro amount on the line after "Investissement minimum".
    m = re.search(r'[Ii]nvestissement\s*minimum\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['H'] = m.group(1).strip()
    
    # K: debt ratio ("Taux d'endettement"), stored with a trailing '%'.
    m = re.search(r'[Tt]aux\s*d\'endettement\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['K'] = m.group(1) + '%'
    
    # L: share count — digits with optional thousands spaces, spaces removed.
    m = re.search(r'[Nn]ombre\s*de\s*parts?\s*(?:émises|en circulation)?\s*\n?\s*:?\s*(\d[\d\s]*)', text)
    if m: d['L'] = m.group(1).strip().replace(' ', '')
    
    # M: shares pending redemption ("parts en attente").
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente', text)
    if m: d['M'] = m.group(1).strip().replace(' ', '')
    
    # N: retained earnings ("report à nouveau") — prefer the "N jours de
    # distribution" form, otherwise fall back to a plain euro amount.
    m = re.search(r'[Rr]eport\s*[àa]\s*nouveau\s*\n\s*(\d+\s*jours?\s*de\s*distribution)', text)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'[Rr]eport\s*[àa]\s*nouveau\s*\n?\s*:?\s*([\d\s,.]+€)', text)
        if m: d['N'] = m.group(1).strip()
    
    # O: major-maintenance provision (PGE) — raw numeric text as found.
    m = re.search(r'[Pp]rovision.*?[Gg]ros.*?[Ee]ntretien\s*\n?\s*:?\s*([\d\s,.]+)', text)
    if m: d['O'] = m.group(1).strip()
    
    # P: tax regime — checked from most to least specific wording.
    if re.search(r'impôt sur les sociétés allemand|IS\s*15,825|fiscalité allemande', text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r'source étrangère|conventions fiscales', text, re.I):
        d['P'] = 'Revenus source étrangère (conventions fiscales)'
    elif re.search(r'revenus fonciers|PFU|prélèvement forfaitaire', text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'
    
    # Q: entitlement delay ("délai de jouissance"), e.g. "6 mois".
    m = re.search(r'[Dd][ée]lai\s*de\s*jouissance\s*\n?\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()
    
    # R: distribution frequency — quarterly/semiannual/annual keywords.
    if re.search(r'4\s*acomptes?\s*trimestriel|trimestrielle', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'2\s*acomptes?\s*semestriel|semestrielle', text, re.I): d['R'] = 'Semestrielle'
    elif re.search(r'annuelle', text, re.I): d['R'] = 'Annuelle'
    
    # S: financial occupancy rate (TOF), percentage on the following line.
    m = re.search(r'(?:TOF|Taux\s*d\'[Oo]ccupation\s*[Ff]inancier)\s*(?:\(.*?\))?\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['S'] = m.group(1) + '%'
    
    # T: annual global performance (PGA), optionally labelled with a year.
    m = re.search(r'[Pp]erformance\s*globale\s*annuelle\s*(?:2025|\s)*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['T'] = m.group(1) + '%'
    
    # U: 10-year internal rate of return (TRI).
    m = re.search(r'TRI\s*10\s*ans\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['U'] = m.group(1) + '%'
    
    # V: subscription price per share, in euros.
    m = re.search(r'[Pp]rix\s*de\s*(?:la\s*part|souscription)\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['V'] = m.group(1).strip()
    
    # W: reconstitution value, in euros.
    m = re.search(r'[Vv]aleur\s*de\s*reconstitution\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['W'] = m.group(1).strip()
    
    # X: subscription fee — percentage plus its HT/TTC qualifier.
    m = re.search(r'[Ff]rais\s*de\s*souscription\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m: d['X'] = m.group(1) + ' ' + m.group(2)
    
    # Y: management fee — percentage plus its HT/TTC qualifier.
    m = re.search(r'[Ff]rais\s*de\s*gestion\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m: d['Y'] = m.group(1) + ' ' + m.group(2)
    
    return d

def gws_update(range_str, value):
    """Write a single cell on the 'Listing sites' tab via the gws CLI.

    *range_str* is an A1-style cell reference (e.g. "K42"); *value* is
    written with USER_ENTERED semantics.  Returns True when the API
    response mentions updated cells, False otherwise.
    """
    target = f"Listing sites!{range_str}"
    params = {
        "spreadsheetId": SHEET_ID,
        "range": target,
        "valueInputOption": "USER_ENTERED"
    }
    body = {"range": target, "values": [[value]]}
    proc = subprocess.run(
        ['gws', 'sheets', 'spreadsheets', 'values', 'update',
         '--params', json.dumps(params),
         '--json', json.dumps(body)],
        capture_output=True, text=True,
        cwd='/home/shingokuga/.openclaw/workspace')
    # Success is detected from the API echo rather than the exit code.
    return 'updatedCells' in proc.stdout

# ---------------------------------------------------------------------------
# Main script: download current sheet values, then for each row whose SCPI
# name has a verified CBI URL, fetch the page and fill only the empty cells.
# ---------------------------------------------------------------------------
cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
       '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
result = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
if result.returncode != 0:
    # Fail loudly with the CLI's own message instead of a cryptic JSON traceback.
    raise SystemExit(f"gws get failed (exit {result.returncode}): {result.stderr.strip()}")
try:
    sheet = json.loads(result.stdout)
except json.JSONDecodeError as exc:
    raise SystemExit(f"gws returned non-JSON output: {exc}") from exc
rows = sheet.get('values', [])
headers = rows[0] if rows else []

# Placeholder values that count as "empty" and are safe to overwrite.
EMPTY_MARKERS = {'', 'N/A', 'Non trouvé', '-'}

total_cells = 0  # cells successfully written across all rows
total_rows = 0   # rows that received at least one update attempt

for i in range(1, len(rows)):  # start at 1: row 0 is the header
    row = rows[i]
    r = i + 1  # sheet rows are 1-based
    scpi = row[1] if len(row) > 1 else ''  # column B holds the SCPI name
    if not scpi or scpi not in CBI_URLS:
        continue

    # Map empty columns D..Y to {letter: index}; short rows count as empty.
    empty_cols = {}
    for j in range(3, 25):
        val = row[j].strip() if j < len(row) else ''
        if val in EMPTY_MARKERS:
            empty_cols[chr(65 + j)] = j

    if not empty_cols:
        continue

    # Fetch and parse the SCPI's fact sheet; skip the row on fetch failure.
    text = html_to_text(CBI_URLS[scpi])
    if not text:
        continue

    data = extract(text)
    if not data:
        continue

    # Fill only cells that are both empty in the sheet and present in the page data.
    updates = [(col, idx, data[col]) for col, idx in empty_cols.items() if col in data]
    if not updates:
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, col_idx, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # stay under the Sheets API write-rate quota
        else:
            print(f"{col}❌ ", end="", flush=True)

    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)  # brief pause between rows

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")
