#!/usr/bin/env python3
"""
Mass scrape commentbieninvestir.fr for ALL SCPI rows with gaps.
Extracts: Label ISR, Nb parts, Parts retrait, RAN, PGE, Jouissance, Commissions,
          Type capital, Secteur geo, Secteur activite, Fiscalite, Freq distribution
"""
import json, re, subprocess, time, sys
from urllib.request import urlopen, Request

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

# ── Pull the current sheet contents via the gws CLI ──────────────────────
cmd = [
    'gws', 'sheets', 'spreadsheets', 'values', 'get',
    '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})
]
result = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
sheet = json.loads(result.stdout)
rows = sheet.get('values', [])
headers = rows[0] if rows else []

# Flag every data row that has an SCPI name (column B) but still has blank
# placeholder values anywhere in columns D..Y (indices 3..24).
BLANK_VALUES = ('', 'N/A', 'Non trouvé', '-')
needs_fix = []
# Sheet rows are 1-based and row 1 is the header, so data starts at row 2.
for row_num, row in enumerate(rows[1:], start=2):
    scpi = row[1] if len(row) > 1 else ''
    if not scpi:
        continue

    empty_cols = {
        j for j in range(3, 25)
        if (row[j].strip() if j < len(row) else '') in BLANK_VALUES
    }

    if empty_cols:
        needs_fix.append((row_num, scpi, empty_cols, row[0] if row else ''))

print(f"Total rows needing fixes: {len(needs_fix)}")

# commentbieninvestir.fr URLs for each SCPI (try multiple slug formats)
def make_slugs(name):
    base = name.lower()
    base = base.replace('é', 'e').replace('è', 'e').replace('ê', 'e').replace('ë', 'e')
    base = base.replace('â', 'a').replace('à', 'a').replace('î', 'i').replace('ï', 'i')
    base = base.replace('ô', 'o').replace('û', 'u').replace('ü', 'u').replace('ç', 'c')
    base = base.replace("'", '').replace('(', '').replace(')', '')
    slugs = [
        f"https://commentbieninvestir.fr/scpi-{base.strip().replace(' ', '-')}/",
        f"https://commentbieninvestir.fr/{base.strip().replace(' ', '-')}/",
    ]
    # Add known aliases
    aliases = {
        'Epargne Pierre': 'epargne-pierre',
        'Accimmo Pierre': 'accimmo-pierre',
        'Opus Real': 'opus-real',
        'Crédit Mutuel Pierre 1': 'credit-mutuel-pierre-1',
        'Epargne Foncière': 'epargne-fonciere',
        'Kyaneos Pierre': 'kyaneos-pierre',
        'Edissimmo': 'edissimmo',
        'Genepierre': 'genepierre',
        'Rivoli Avenir Patrimoine': 'rivoli-avenir-patrimoine',
        'Activimmo': 'activimmo',
        'New Gen': 'newgen-immo',
        'GMA Essentialis': 'gma-essentialis',
        'Affinités Pierre': 'affinites-pierre',
        'Elysées Pierre': 'elysees-pierre',
        'Elysées Grand Large': 'elysees-grand-large',
        'Buroboutic Métropoles': 'buroboutic-metropoles',
        'Ficommerce Proximité': 'ficommerce-proximite',
        'Pierre Expansion Santé': 'pierre-expansion-sante',
        'Selectipierre 2 - Paris': 'selectipierre-2',
        'Novapierre 1': 'novapierre-1',
        'Paref Evo': 'paref-evo',
        'Paref Hexa': 'paref-hexa',
        'Paref Prima': 'paref-prima',
        'Perial Grand Paris': 'perial-grand-paris',
        'Perial Hospitalité Europe': 'perial-hospitalite-europe',
        'Perial O2': 'perial-o2',
        'Perial Opportunités Europe': 'perial-opportunites-europe',
        'Perial Opportunités Territoires': 'perial-opportunites-territoires',
        'Patrimmo Commerce': 'patrimmo-commerce',
        'Praemia Hotels Europe': 'praemia-hotels-europe',
        'Primopierre': 'primopierre',
        'Primovie': 'primovie',
        'Principal Inside': 'principal-inside',
        'Remake Live': 'remake-live',
        'Remake UK 2025': 'remake-uk-2025',
        'Efimmo 1': 'efimmo-1',
        'Immorente': 'immorente',
        'Sofiboutique': 'sofiboutique',
        'Sofidy Europe Invest': 'sofidy-europe-invest',
        'Sofidynamic': 'sofidynamic',
        'Sofipierre': 'sofipierre',
        'Coeur d\'Avenir': 'coeur-d-avenir',
        'Coeur d\'Europe': 'coeur-d-europe',
        'Coeur de régions': 'coeur-de-regions',
        'Coeur de ville': 'coeur-de-ville',
        'Esprit Horizon': 'esprit-horizon',
        'ESG Pierre Capitale': 'pierre-capitale',
        'Mistral Sélection': 'mistral-selection',
        'Telamon Borea': 'telamon-borea',
        'LOG IN': 'login-theoreim',
        'Urban Coeur Commerce': 'urban-coeur-commerce',
        'Wemo One': 'wemo-one',
        'Foncière Des Praticiens': 'fonciere-des-praticiens',
        'Osmo Energie': 'osmo-energie',
        'Novaxia Neo': 'novaxia-neo',
        'Atout Pierre Diversification': 'atout-pierre-diversification',
        'AEW Commerces Europe': 'aew-commerces-europe',
        'AEW Diversification Allemagne': 'aew-diversification-allemagne',
        'AEW Opportunités Europe': 'aew-opportunites-europe',
        'AEW Patrimoine Santé': 'aew-patrimoine-sante',
        'LF Croissance et Territoires': 'lf-croissance-et-territoires',
        'LF Europimmo': 'lf-europimmo',
        'LF Grand Paris Patrimoine': 'lf-grand-paris-patrimoine',
        'LF Opportunité Immo': 'lf-opportunite-immo',
        'Selectinvest 1': 'selectinvest-1',
        'Reason': 'reason',
        'My Share Education': 'my-share-education',
        'My Share SCPI': 'my-share-scpi',
        'NCap Continent': 'ncap-continent',
        'NCap Education Santé': 'ncap-education-sante',
        'NCap Régions': 'ncap-regions',
    }
    if name in aliases:
        slugs.insert(0, f"https://commentbieninvestir.fr/scpi-{aliases[name]}/")
        slugs.insert(1, f"https://commentbieninvestir.fr/{aliases[name]}/")
    return slugs

def fetch_page(url):
    """Download *url* and return its visible text, or None on any failure.

    Scripts/styles are stripped with their contents, <br> and block-level
    tags become newlines (later regexes are line-anchored), remaining tags
    become spaces, and the two entities the pages actually use are decoded.
    """
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        # Context manager closes the connection (the original leaked it);
        # `except Exception` (not bare except) lets Ctrl-C/SystemExit through.
        with urlopen(req, timeout=15) as resp:
            html = resp.read().decode('utf-8', errors='ignore')
    except Exception:
        return None
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    # Block-level elements delimit lines for the line-oriented extractors.
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th|dd|dt)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    text = re.sub(r'&nbsp;', ' ', text)
    text = re.sub(r'&amp;', '&', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n', text)
    return text

def extract_data(text):
    """Parse cleaned page text into a dict keyed by sheet column letter.

    Keys are spreadsheet columns (D..Y); only fields actually found on the
    page are present.  All patterns assume the newline-delimited text that
    fetch_page() produces.
    """
    d = {}

    # G — Label ISR: mere presence of the label name on the page.
    if re.search(r'[Ll]abel ISR', text): d['G'] = 'Oui'

    # D — Type de capital (fixe / variable)
    m = re.search(r'[Cc]apital\s*(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()

    # E — Secteur geo: country/percent pairs after "Repartition Geographique"
    m = re.search(r'[Rr][ée]partition [Gg][ée]ographique.*?\n((?:.*?\n){3,10})', text)
    if m:
        geo = m.group(1).strip()
        countries = re.findall(r'([A-Z][\wéèêâàîïôûüç\s]+?)\s+(\d+[,.]?\d*\s*%)', geo)
        if countries:
            d['E'] = ', '.join(f'{c.strip()} {p}' for c, p in countries[:5])

    # F — Secteur activite: same shape, "Repartition Sectorielle" section
    m = re.search(r'[Rr][ée]partition [Ss]ectorielle.*?\n((?:.*?\n){3,10})', text)
    if m:
        sect = m.group(1).strip()
        sectors = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*\s*%)', sect)
        if sectors:
            d['F'] = ', '.join(f'{s.strip()} {p}' for s, p in sectors[:5])

    # P — Fiscalite: classify from the 200 chars around the first fiscal mention
    m = re.search(r'(?:fiscalit[ée]|imposition)[^.]*?(?:revenus fonciers|PFU|pr[ée]l[ée]vement|micro-foncier|micro-bnc)', text, re.I)
    if m:
        snippet = text[m.start():m.start()+200]
        if 'allemand' in snippet.lower() or 'is 15' in snippet.lower():
            d['P'] = 'Allemande (IS 15,825%)'
        # Bug fix: the original tested the literal substring 'source [ée]trang'
        # with `in` (a regex class pasted into a membership test — it could
        # never match).  Use an actual regex search instead.
        elif re.search(r'source [ée]trang', snippet, re.I):
            d['P'] = 'Revenus source étrangère (conventions fiscales)'
        else:
            d['P'] = 'Française (revenus fonciers / PFU 30%)'

    # H — Souscription minimum
    m = re.search(r'[Ii]nvestissement minimum\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['H'] = m.group(1).strip()

    # L — Nombre de parts (thousands separators removed)
    m = re.search(r'[Nn]ombre (?:d\'|de )parts?\s*(?:émises|en circulation)?\s*\n?\s*:?\s*(\d[\d\s]*)', text)
    if m: d['L'] = m.group(1).strip().replace(' ', '')

    # M — Parts en attente de retrait
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente', text)
    if m: d['M'] = m.group(1).strip().replace(' ', '')

    # N — Report à nouveau: prefer "N jours de distribution", else a € amount
    m = re.search(r'[Rr]eport [àa] nouveau\s*\n\s*(\d+\s*jours?\s*de\s*distribution)', text)
    if m: d['N'] = m.group(1).strip()
    if 'N' not in d:
        m = re.search(r'[Rr]eport [àa] nouveau\s*\n?\s*:?\s*([\d\s,.]+€)', text)
        if m: d['N'] = m.group(1).strip()

    # O — Provision pour gros entretien (PGE)
    m = re.search(r'[Pp]rovision (?:pour )?[Gg]ros [Ee]ntretien\s*\n?\s*:?\s*([\d\s,.]+€)', text)
    if m: d['O'] = m.group(1).strip()

    # Q — Délai de jouissance
    m = re.search(r'[Dd][ée]lai de jouissance\s*\n?\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()

    # R — Fréquence de distribution, with keyword fallbacks
    m = re.search(r'[Ff]r[ée]quence de distribution\s*\n?\s*:?\s*(\w+)', text, re.I)
    if m: d['R'] = m.group(1).strip().capitalize()
    if 'R' not in d:
        if re.search(r'4\s*acomptes?\s*trimestriel', text, re.I): d['R'] = 'Trimestrielle'
        elif re.search(r'acompte.*semestriel', text, re.I): d['R'] = 'Semestrielle'

    # S — TOF (taux d'occupation financier)
    m = re.search(r'(?:TOF|Taux d\'[Oo]ccupation [Ff]inancier)\s*(?:\(.*?\))?\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['S'] = m.group(1) + '%'

    # T — Performance globale annuelle
    m = re.search(r'[Pp]erformance globale annuelle\s*(?:2025|\s)*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['T'] = m.group(1) + '%'

    # U — TRI 10 ans
    m = re.search(r'TRI\s*10\s*ans\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['U'] = m.group(1) + '%'

    # V — Valeur / prix de souscription
    m = re.search(r'[Pp]rix de (?:la part|souscription)\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['V'] = m.group(1).strip()

    # W — Valeur de reconstitution
    m = re.search(r'[Vv]aleur de reconstitution\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['W'] = m.group(1).strip()

    # X — Commission de souscription.  Bug fix: the original matched HT/TTC
    # non-capturing and then looked for 'TTC' in the 20 chars AFTER the
    # match, so '10% TTC' was labelled ' HT'.  Capture the token instead.
    m = re.search(r'[Ff]rais de souscription\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m:
        d['X'] = f"{m.group(1)} {m.group(2)}"

    # Y — Commission de gestion (same bug, same fix)
    m = re.search(r'[Ff]rais de gestion\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m:
        d['Y'] = f"{m.group(1)} {m.group(2)}"

    # K — Taux d'endettement
    m = re.search(r'[Tt]aux d\'endettement\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['K'] = m.group(1) + '%'

    # J — Capitalisation
    m = re.search(r'[Cc]apitalisation\s*\n\s*(\d+[,.]?\d*\s*(?:milliards?|Mds?|M|Md)\s*€)', text)
    if m: d['J'] = m.group(1).strip()

    return d

def gws_update(range_str, values):
    """Write *values* into `Listing sites!range_str` via the gws CLI.

    Returns True when the CLI response mentions updated cells.
    """
    full_range = f"Listing sites!{range_str}"
    params = {
        "spreadsheetId": SHEET_ID,
        "range": full_range,
        "valueInputOption": "USER_ENTERED"
    }
    body = {"range": full_range, "values": [values]}
    proc = subprocess.run(
        ['gws', 'sheets', 'spreadsheets', 'values', 'update',
         '--params', json.dumps(params),
         '--json', json.dumps(body)],
        capture_output=True, text=True,
        cwd='/home/shingokuga/.openclaw/workspace',
    )
    return 'updatedCells' in proc.stdout

# ── Main scrape loop: fill only the currently blank cells of each row ────
total_updated = 0
total_cells = 0

for row_num, scpi_name, empty_cols, url in needs_fix:
    # First slug variant that yields a substantial page (>1000 chars) wins.
    text = next(
        (t for t in (fetch_page(u) for u in make_slugs(scpi_name))
         if t and len(t) > 1000),
        None,
    )

    if text is None:
        # Try ideal-investisseur as fallback
        # Already scraped earlier, skip
        continue

    data = extract_data(text)
    if not data:
        continue

    # Keep only the extracted values whose sheet column is currently empty.
    cells_to_update = {
        chr(65 + col_idx): data[chr(65 + col_idx)]
        for col_idx in empty_cols
        if chr(65 + col_idx) in data
    }
    if not cells_to_update:
        continue

    # Update each cell individually, throttling between API calls.
    print(f"Row {row_num} ({scpi_name}): ", end="", flush=True)
    row_updated = 0
    for col_letter, value in cells_to_update.items():
        if gws_update(f"{col_letter}{row_num}", value):
            print(f"{col_letter}✅ ", end="", flush=True)
            row_updated += 1
            time.sleep(0.2)
        else:
            print(f"{col_letter}❌ ", end="", flush=True)

    print(f"({row_updated} cells)")
    total_updated += 1
    total_cells += row_updated
    time.sleep(0.3)

print(f"\n\n🏁 TOTAL: {total_updated} rows updated, {total_cells} cells filled")
