#!/usr/bin/env python3
"""Use web_fetch via exec to get readability-extracted content for each SCPI"""
import json, subprocess, re, time
from urllib.request import Request, urlopen

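# Target spreadsheet (ID from its URL); values are written to the "Listing sites" tab.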
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

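# Sheet row number -> source article URL on commentbieninvestir.fr for that SCPI.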
SCPI_PAGES = {
    70: "https://commentbieninvestir.fr/scpi-fonciere-des-praticiens-avis-analyse-et-souscription/",
    71: "https://commentbieninvestir.fr/scpi-osmo-energie/",
    80: "https://commentbieninvestir.fr/paref-evo/",
    81: "https://commentbieninvestir.fr/scpi-paref-hexa/",
    83: "https://commentbieninvestir.fr/scpi-perial-grand-paris/",
    84: "https://commentbieninvestir.fr/scpi-perial-hospitalite-europe/",
    85: "https://commentbieninvestir.fr/scpi-perial-o2/",
    86: "https://commentbieninvestir.fr/scpi-perial-opportunites-europe/",
    90: "https://commentbieninvestir.fr/scpi-primopierre/",
    91: "https://commentbieninvestir.fr/scpi-primovie/",
    99: "https://commentbieninvestir.fr/scpi-sofidynamic-avis-analyse/",
    101: "https://commentbieninvestir.fr/scpi-coeur-davenir-avis-et-analyse/",
    105: "https://commentbieninvestir.fr/scpi-esprit-horizon/",
    111: "https://commentbieninvestir.fr/scpi-wemo-one/",
}

def extract_from_text(text):
    """Extract SCPI data from commentbieninvestir readability text"""
    data = {}
    
    # Distribution / TD
    m = re.search(r'[Tt]aux de distribution\s*(?:2025|cible|\s)*\n\s*(\d+[,.]\d+)\s*%', text)
    if not m:
        m = re.search(r'[Dd]istribution\s*2025\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m:
        data['td'] = m.group(1) + '%'
    
    # PGA
    m = re.search(r'[Pp]erformance globale annuelle\s*(?:2025|\s)*\n\s*(\d+[,.]\d+)\s*%', text)
    if m:
        data['pga'] = m.group(1) + '%'
    
    # TRI 10 ans
    m = re.search(r'TRI\s*10\s*ans\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m:
        data['tri_10'] = m.group(1) + '%'
    
    # TOF
    m = re.search(r'(?:TOF|Taux d\'[Oo]ccupation [Ff]inancier)\s*(?:\(.*?\))?\s*(?:T\d\s*\d{4}|\s)*\n\s*(\d+[,.]\d+)\s*%', text)
    if m:
        data['tof'] = m.group(1) + '%'
    
    # Capitalisation
    m = re.search(r'[Cc]apitalisation\s*\n\s*(\d+[,.]?\d*\s*(?:milliards?|Mds?|M|Md)\s*€)', text)
    if m:
        data['capitalisation'] = m.group(1).strip()
    
    # Prix souscription
    m = re.search(r'[Pp]rix de (?:la part|souscription)\s*\n\s*(\d[\d\s,.]*€)', text)
    if m:
        data['prix_souscription'] = m.group(1).strip()
    
    # Valeur reconstitution
    m = re.search(r'[Vv]aleur de reconstitution\s*\n\s*(\d[\d\s,.]*€)', text)
    if m:
        data['val_reconstitution'] = m.group(1).strip()
    
    # Endettement
    m = re.search(r'[Tt]aux d\'endettement\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m:
        data['endettement'] = m.group(1) + '%'
    
    # Date creation
    m = re.search(r'[Cc]r[ée]\w*\s*(?:en\s*)?(\w+\s+\d{4}|\d{4})', text)
    if m:
        data['date_creation'] = m.group(1).strip()
    
    # Parts en attente
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente', text)
    if m:
        data['parts_retrait'] = m.group(1).strip().replace(' ', '')
    
    # Souscription min
    m = re.search(r'[Ii]nvestissement minimum\s*\n\s*(\d[\d\s,.]*€)', text)
    if m:
        data['souscription_min'] = m.group(1).strip()
    
    # Label ISR (simple presence check; may match even if the article only mentions the label)
    if re.search(r'[Ll]abel ISR', text):
        data['label_isr'] = 'Oui'
    
    # Frais souscription (capture the HT/TTC suffix directly instead of peeking past the match)
    m = re.search(r'[Ff]rais de souscription\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m:
        data['comm_souscription'] = f"{m.group(1)} {m.group(2)}"
    
    # Frais gestion (same suffix handling as above)
    m = re.search(r'[Ff]rais de gestion\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m:
        data['comm_gestion'] = f"{m.group(1)} {m.group(2)}"
    
    # RAN
    m = re.search(r'[Rr]eport [àa] nouveau\s*\n\s*(\d+\s*jours?\s*de\s*distribution)', text)
    if m:
        data['ran'] = m.group(1).strip()
    
    return data

def gws_update(range_str, values):
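    """Write one row of values into the "Listing sites" tab via the `gws` CLI.

    Assumes `gws` is installed and authenticated in the workspace directory;
    returns True when the API response reports updated cells.
    """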
    cmd = [
        'gws', 'sheets', 'spreadsheets', 'values', 'update',
        '--params', json.dumps({
            "spreadsheetId": SHEET_ID,
            "range": f"Listing sites!{range_str}",
            "valueInputOption": "USER_ENTERED"
        }),
        '--json', json.dumps({
            "range": f"Listing sites!{range_str}",
            "values": [values]
        })
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in result.stdout

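# Fetch each article, flatten the HTML to plain text, and extract whatever figures we can.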
all_results = {}
for row_num, url in SCPI_PAGES.items():
    print(f"\nRow {row_num}...", end=" ", flush=True)
    try:
        # Fetch the page with a bot-friendly User-Agent; some sites block the default urllib one.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        resp = urlopen(req, timeout=15)
        html = resp.read().decode('utf-8', errors='ignore')
        
        # Quick HTML to text
        # Remove scripts/styles
        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
        # Keep line breaks
        html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
        html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th|dd|dt)[^>]*>', '\n', html, flags=re.I)
        # Strip remaining tags
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'&nbsp;', ' ', text)
        text = re.sub(r'&amp;', '&', text)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n', text)
        
        data = extract_from_text(text)
        all_results[row_num] = data
        
        print(f"({len(data)} fields)")
        if data:
            for k, v in data.items():
                print(f"  {k}: {v}")
        time.sleep(0.5)
    except Exception as e:
        print(f"ERROR: {e}")

# Update sheet
print("\n\n=== UPDATING SHEET ===")
updated = 0
for row_num, data in all_results.items():
    if len(data) < 3:
        print(f"SKIP Row {row_num} — {len(data)} fields")
        continue
    
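    # One cell per column D..Y (22 values); each index below is an offset from column D.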
    vals = [''] * 22
    if 'label_isr' in data: vals[3] = data['label_isr']
    if 'souscription_min' in data: vals[4] = data['souscription_min']
    if 'date_creation' in data: vals[5] = data['date_creation']
    if 'capitalisation' in data: vals[6] = data['capitalisation']
    if 'endettement' in data: vals[7] = data['endettement']
    if 'parts_retrait' in data: vals[9] = data['parts_retrait']
    if 'ran' in data: vals[10] = data['ran']
    if 'tof' in data: vals[15] = data['tof']
    if 'pga' in data: vals[16] = data['pga']
    if 'tri_10' in data: vals[17] = data['tri_10']
    if 'prix_souscription' in data: vals[18] = data['prix_souscription']
    if 'val_reconstitution' in data: vals[19] = data['val_reconstitution']
    if 'comm_souscription' in data: vals[20] = data['comm_souscription']
    if 'comm_gestion' in data: vals[21] = data['comm_gestion']
    
    non_empty = sum(1 for v in vals if v)
    print(f"Row {row_num}: {non_empty} fields → ", end="", flush=True)
    
    if gws_update(f"D{row_num}:Y{row_num}", vals):
        print("✅")
        updated += 1
        time.sleep(0.3)
    else:
        print("❌")

print(f"\n✅ Updated {updated} rows")
