#!/usr/bin/env python3
"""
Pass 5: commentbieninvestir.fr via direct URL (verified earlier) for remaining gaps.
Focus: K (endettement), H (souscription), Q (jouissance), X/Y (commissions), 
       D (capital), L (parts), N (RAN), O (PGE)
"""
import json, re, subprocess, time, sys
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

# ID of the Google Sheet ("Listing sites" tab) that this pass fills in.
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

# Verified CBI URLs from earlier pass.
# Maps the SCPI name exactly as it appears in column B of the sheet to its
# commentbieninvestir.fr fact-sheet URL. Rows whose SCPI name is not a key
# here are skipped by the driver loop below.
CBI_URLS = {
    "Activimmo": "https://commentbieninvestir.fr/scpi-activimmo/",
    "Aestiam Horizon": "https://commentbieninvestir.fr/scpi-aestiam-horizon/",
    "Alta Convictions": "https://commentbieninvestir.fr/scpi-alta-convictions/",
    "Altixia Cadence XII": "https://commentbieninvestir.fr/altixia-cadence-xii/",
    "Altixia Commerces": "https://commentbieninvestir.fr/scpi-altixia-commerces/",
    "Cap Foncières et Territoires": "https://commentbieninvestir.fr/scpi-cap-foncieres-et-territoires/",
    "Coeur d'Avenir": "https://commentbieninvestir.fr/scpi-coeur-davenir/",
    "Coeur d'Europe": "https://commentbieninvestir.fr/scpi-coeur-deurope/",
    "Coeur de régions": "https://commentbieninvestir.fr/scpi-coeur-de-regions/",
    "Comète": "https://commentbieninvestir.fr/scpi-comete/",
    "Corum Eurion": "https://commentbieninvestir.fr/scpi-corum-eurion/",
    "Corum Origin": "https://commentbieninvestir.fr/scpi-corum-origin/",
    "Corum USA": "https://commentbieninvestir.fr/scpi-corum-usa/",
    "Corum XL": "https://commentbieninvestir.fr/scpi-corum-xl/",
    "Cristal Life": "https://commentbieninvestir.fr/scpi-cristal-life/",
    "Cristal Rente": "https://commentbieninvestir.fr/scpi-cristal-rente/",
    "Darwin RE01": "https://commentbieninvestir.fr/scpi-darwin-re01/",
    "Eden": "https://commentbieninvestir.fr/scpi-eden/",
    "Elevation Tertiom": "https://commentbieninvestir.fr/scpi-elevation-tertiom/",
    "Elialys": "https://commentbieninvestir.fr/scpi-elialys/",
    "Epargne Foncière": "https://commentbieninvestir.fr/epargne-fonciere/",
    "Epargne Pierre": "https://commentbieninvestir.fr/scpi-epargne-pierre/",
    "Epargne Pierre Europe": "https://commentbieninvestir.fr/epargne-pierre-europe/",
    "Epargne Pierre Sophia": "https://commentbieninvestir.fr/scpi-epargne-pierre-sophia/",
    "Epsicap Explore": "https://commentbieninvestir.fr/scpi-epsicap-explore/",
    "Epsicap Nano": "https://commentbieninvestir.fr/scpi-epsicap-nano/",
    "Esprit Horizon": "https://commentbieninvestir.fr/scpi-esprit-horizon/",
    "Eurovalys": "https://commentbieninvestir.fr/scpi-eurovalys/",
    "Euryale Horizons Santé": "https://commentbieninvestir.fr/scpi-euryale-horizons-sante/",
    "Foncière Des Praticiens": "https://commentbieninvestir.fr/scpi-fonciere-des-praticiens/",
    "GMA Essentialis": "https://commentbieninvestir.fr/scpi-gma-essentialis/",
    "Iroko Atlas": "https://commentbieninvestir.fr/scpi-iroko-atlas/",
    "Iroko Zen": "https://commentbieninvestir.fr/scpi-iroko-zen/",
    "LF Avenir Santé": "https://commentbieninvestir.fr/lf-avenir-sante/",
    "LOG IN": "https://commentbieninvestir.fr/scpi-log-in/",
    "Linaclub": "https://commentbieninvestir.fr/scpi-linaclub/",
    "Mistral Sélection": "https://commentbieninvestir.fr/scpi-mistral-selection/",
    "MomenTime": "https://commentbieninvestir.fr/scpi-momentime/",
    "NCap Continent": "https://commentbieninvestir.fr/scpi-ncap-continent/",
    "NCap Education Santé": "https://commentbieninvestir.fr/scpi-ncap-education-sante/",
    "NCap Régions": "https://commentbieninvestir.fr/scpi-ncap-regions/",
    "Optimale": "https://commentbieninvestir.fr/scpi-optimale/",
    "Osmo Energie": "https://commentbieninvestir.fr/scpi-osmo-energie/",
    "Paref Evo": "https://commentbieninvestir.fr/scpi-paref-evo/",
    "Paref Hexa": "https://commentbieninvestir.fr/scpi-paref-hexa/",
    "Paref Prima": "https://commentbieninvestir.fr/scpi-paref-prima/",
    "Perial Grand Paris": "https://commentbieninvestir.fr/scpi-perial-grand-paris/",
    "Perial Hospitalité Europe": "https://commentbieninvestir.fr/scpi-perial-hospitalite-europe/",
    "Perial O2": "https://commentbieninvestir.fr/scpi-perial-o2/",
    "Perial Opportunités Europe": "https://commentbieninvestir.fr/scpi-perial-opportunites-europe/",
    "Pierval Santé": "https://commentbieninvestir.fr/scpi-pierval-sante/",
    "Primopierre": "https://commentbieninvestir.fr/scpi-primopierre/",
    "Primovie": "https://commentbieninvestir.fr/scpi-primovie/",
    "Reason": "https://commentbieninvestir.fr/scpi-reason/",
    "Remake Live": "https://commentbieninvestir.fr/scpi-remake-live/",
    "Sofidynamic": "https://commentbieninvestir.fr/scpi-sofidynamic/",
    "Transitions Europe": "https://commentbieninvestir.fr/scpi-transitions-europe/",
    "Upeka": "https://commentbieninvestir.fr/scpi-upeka/",
    "Wemo One": "https://commentbieninvestir.fr/scpi-wemo-one/",
}

def read_sheet():
    """Read 'Listing sites'!A1:Y111 from the Google Sheet via the `gws` CLI.

    Returns the rows as a list of lists (one list of cell strings per row);
    an empty list when the range has no values.

    Raises:
        RuntimeError: when the CLI exits non-zero or prints nothing, with
            the CLI's stderr included — previously json.loads('') raised an
            opaque JSONDecodeError in that case.
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0 or not r.stdout.strip():
        raise RuntimeError(f"gws sheets read failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def gws_update(range_str, value):
    """Write a single value into `Listing sites!<range_str>` via the gws CLI.

    Returns True when the API response reports updated cells, else False.
    """
    sheet_range = f"Listing sites!{range_str}"
    params = {
        "spreadsheetId": SHEET_ID,
        "range": sheet_range,
        "valueInputOption": "USER_ENTERED",
    }
    body = {"range": sheet_range, "values": [[value]]}
    cmd = [
        'gws', 'sheets', 'spreadsheets', 'values', 'update',
        '--params', json.dumps(params),
        '--json', json.dumps(body),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True,
                            cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in result.stdout

def html_to_text(body):
    """Convert an HTML page (bytes or str) to plain text.

    Strips <script>/<style> blocks, turns <br> and block-level tags into
    newlines, drops all remaining tags, decodes HTML entities, and collapses
    runs of spaces/tabs and blank lines.

    Fix: entity decoding now uses stdlib html.unescape, which handles ALL
    named and numeric references — the previous 5-entry table left &lt;,
    &gt;, &eacute;, &#039;, etc. as literal text. Non-breaking spaces and
    right single quotes are normalized to plain ' ' and "'" so the regexes
    in extract_cbi (e.g. "d'endettement") keep matching as before.
    """
    from html import unescape  # local import: `html` is a local variable name below

    html = body.decode('utf-8', errors='ignore') if isinstance(body, bytes) else body
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    # Block-level elements become line breaks so label/value pairs separate.
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    text = unescape(text).replace('\xa0', ' ').replace('\u2019', "'")
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text

def extract_cbi(text):
    """Extract SCPI metrics from the plain text of a commentbieninvestir.fr page.

    `text` is the output of html_to_text for one fact-sheet page. Returns a
    dict keyed by spreadsheet column letter (subset of D..Y), containing only
    the values that were found.

    Fix vs. previous version: numeric capture classes use an explicit space
    instead of the whitespace class, so a captured amount can no longer run
    across a line break and swallow unrelated digits from the following line;
    the O and N(€) captures also require a leading digit so they cannot match
    pure whitespace/punctuation (which stripped down to an empty value).
    """
    d = {}

    # D: Capital (fixe / variable)
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()

    # G: ISR label (presence only)
    if re.search(r'Label\s+ISR', text, re.I): d['G'] = 'Oui'

    # K: Endettement (debt ratio, %)
    m = re.search(r"[Tt]aux\s+d'endettement\s*(?:\(.*?\))?\s*:?\s*(\d+[,.]?\d*)\s*%", text)
    if m: d['K'] = m.group(1) + '%'

    # H: Minimum souscription (amount with € sign)
    m = re.search(r"(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d ,.]*€)", text, re.I)
    if m: d['H'] = m.group(1).strip()

    # Q: Jouissance delay ("N mois")
    m = re.search(r"[Dd][ée]lai\s+de\s+jouissance\s*:?\s*(\d+\s*mois)", text, re.I)
    if m: d['Q'] = m.group(1).strip()

    # L: Number of parts (thousands separated by spaces; spaces dropped)
    m = re.search(r"[Nn]ombre\s*de\s*parts?\s*:?\s*(\d[\d ]*)", text)
    if m: d['L'] = m.group(1).strip().replace(' ', '')

    # M: Parts awaiting withdrawal
    m = re.search(r"(\d[\d ]*)\s*parts?\s*en\s*attente", text)
    if m: d['M'] = m.group(1).strip().replace(' ', '')

    # N: Report à nouveau — prefer "N jours de distribution", fall back to €
    m = re.search(r"[Rr]eport\s*[àa]\s*nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)", text)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r"[Rr]eport\s*[àa]\s*nouveau\s*:?\s*(\d[\d ,.]*€)", text)
        if m: d['N'] = m.group(1).strip()

    # O: Provision pour gros entretien (bare number, unit unknown)
    m = re.search(r"[Pp]rovision.*?[Gg]ros.*?[Ee]ntretien\s*:?\s*(\d[\d ,.]*)", text)
    if m: d['O'] = m.group(1).strip()

    # P: Fiscalité — German corporate tax wording beats the French default
    if re.search(r"impôt.*?sociétés.*?allemand|IS\s*15,825", text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r"revenus?\s+fonciers|PFU", text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'

    # R: Distribution frequency
    if re.search(r'trimestriel', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'semestriel', text, re.I): d['R'] = 'Semestrielle'

    # V: Share / subscription price
    m = re.search(r"[Pp]rix\s*de\s*(?:la\s*part|souscription)\s*:?\s*(\d[\d ,.]*€)", text)
    if m: d['V'] = m.group(1).strip()

    # W: Reconstitution value
    m = re.search(r"[Vv]aleur\s*de\s*reconstitution\s*:?\s*(\d[\d ,.]*€)", text)
    if m: d['W'] = m.group(1).strip()

    # X: Subscription fee; default to TTC when the HT/TTC unit is missing
    m = re.search(r"[Ff]rais\s*de\s*souscription\s*:?\s*(\d+[,.]?\d*)\s*%\s*(HT|TTC)", text)
    if m: d['X'] = m.group(1) + ' ' + m.group(2)
    else:
        m = re.search(r"[Ff]rais\s*de\s*souscription\s*:?\s*(\d+[,.]?\d*)\s*%", text)
        if m: d['X'] = m.group(1) + '% TTC'

    # Y: Management fee (same TTC fallback as X)
    m = re.search(r"[Ff]rais\s*de\s*gestion\s*:?\s*(\d+[,.]?\d*)\s*%\s*(HT|TTC)", text)
    if m: d['Y'] = m.group(1) + ' ' + m.group(2)
    else:
        m = re.search(r"[Ff]rais\s*de\s*gestion\s*:?\s*(\d+[,.]?\d*)\s*%", text)
        if m: d['Y'] = m.group(1) + '% TTC'

    # E/F: Geographic / sector breakdowns — take 2..12 lines after the
    # heading and keep the first 6 "label NN%" pairs found in them.
    m = re.search(r"[Rr][ée]partition\s*[Gg][ée]ographique[^:]*\n((?:[^\n]+\n){2,12})", text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])

    m = re.search(r"[Rr][ée]partition\s*[Ss]ectorielle[^:]*\n((?:[^\n]+\n){2,12})", text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])

    # J: Capitalisation with its unit (milliards / Mds / M)
    m = re.search(r"[Cc]apitalisation\s*:?\s*(\d+[,.]?\d*\s*(?:milliards?|Mds?|M)\s*€?)", text)
    if m: d['J'] = m.group(1).strip()

    return d

# ---- Main driver ----------------------------------------------------------
# Step 1: scan the sheet for rows that (a) have a verified CBI URL and
# (b) still have at least one empty/placeholder cell in columns D..Y.
rows = read_sheet()
needs = {}
for i in range(1, len(rows)):  # skip the header row
    row = rows[i]
    r = i + 1  # 1-based sheet row number
    scpi = row[1].strip() if len(row) > 1 else ''
    if not scpi: continue
    empty = {}
    for j in range(3, 25):  # 0-based indices 3..24 == columns D..Y
        val = row[j].strip() if j < len(row) else ''
        if val in ('', 'N/A', 'Non trouvé', '-'):
            col = chr(65 + j)  # index -> column letter
            empty[col] = j
    if empty and scpi in CBI_URLS:
        needs[r] = (scpi, empty)

print(f"Rows with CBI URL and gaps: {len(needs)}")

# Step 2: fetch each page, extract data points, and fill ONLY the gaps.
fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0

for r, (scpi, empty) in sorted(needs.items()):
    url = CBI_URLS[scpi]
    try:
        page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        # Skip error statuses and near-empty bodies (soft-404 pages).
        if page.status != 200 or not page.body or len(page.body) < 1000:
            continue
        text = html_to_text(page.body)
    except Exception:
        # Best-effort per row, but narrowed from a bare `except:` so that
        # Ctrl-C (KeyboardInterrupt) still aborts the whole run.
        continue

    data = extract_cbi(text)
    updates = [(col, data[col]) for col in empty if col in data]
    if not updates: continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # stay under the Sheets API write rate limit
        else:
            print(f"{col}❌ ", end="", flush=True)
    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)  # be polite to the target site between pages

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled via CBI")
