#!/usr/bin/env python3
"""
Pass 7: Build complete II URL map from sitemap, then mass-scrape all remaining gaps.
"""
import json, re, subprocess, time, sys
from urllib.request import urlopen, Request
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"
II_BASE = "https://www.ideal-investisseur.fr"

# Build URL map from sitemap.
# Use a context manager so the HTTP response is always closed (the original
# left the connection dangling until GC).
req = Request("https://www.ideal-investisseur.fr/sitemap.xml", headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req, timeout=15) as resp:
    xml = resp.read().decode('utf-8')
# Keep only SCPI review pages; everything else in the sitemap is irrelevant here.
all_urls = re.findall(r'<loc>(https://www\.ideal-investisseur\.fr/scpi-avis/[^<]+)</loc>', xml)

# Extract SCPI name from URL slug
# Extract SCPI name from URL slug
def slug_to_name(url):
    """Derive a display name from a review URL slug.

    Example: '.../scpi-avis/accimmo-pierre-1050.html' -> 'Accimmo Pierre'.

    Returns None explicitly when the URL does not match the
    slug-with-numeric-id pattern (the original fell through implicitly).
    """
    m = re.search(r'/scpi-avis/(.+?)-\d+\.html', url)
    if m:
        # Slug words are hyphen-separated lowercase; title-case for matching
        # against sheet names.
        return m.group(1).replace('-', ' ').title()
    return None

# Build name -> URL index from the sitemap; a later duplicate name simply
# overwrites an earlier one, exactly as the manual dict-assign loop did.
url_map = {}
for sitemap_url in all_urls:
    scpi_name = slug_to_name(sitemap_url)
    if scpi_name:
        url_map[scpi_name] = sitemap_url

# Manual overrides for name mismatches.
# The sheet's SCPI names often differ from the sitemap slug (extra "scpi-"
# prefix, accents dropped, renamed products, manager prefixes, etc.), so
# these entries are hand-curated and take precedence over the derived map.
manual_map = {
    "Aestiam Agora": "https://www.ideal-investisseur.fr/scpi-avis/aestiam-agora-1028.html",
    "Aestiam Horizon": "https://www.ideal-investisseur.fr/scpi-avis/aestiam-horizon-1071.html",
    "AEW Commerces Europe": "https://www.ideal-investisseur.fr/scpi-avis/aew-commerces-europe-10058.html",
    "AEW Diversification Allemagne": "https://www.ideal-investisseur.fr/scpi-avis/aew-diversification-allemagne-10025.html",
    "AEW Opportunités Europe": "https://www.ideal-investisseur.fr/scpi-avis/aew-opportunites-europe-10026.html",
    "AEW Patrimoine Santé": "https://www.ideal-investisseur.fr/scpi-avis/aew-patrimoine-sante-10027.html",
    "Atout Pierre Diversification": "https://www.ideal-investisseur.fr/scpi-avis/atout-pierre-diversification-1146.html",
    "Edissimmo": "https://www.ideal-investisseur.fr/scpi-avis/edissimmo-1054.html",
    "Genepierre": "https://www.ideal-investisseur.fr/scpi-avis/genepierre-1077.html",
    "Rivoli Avenir Patrimoine": "https://www.ideal-investisseur.fr/scpi-avis/scpi-rivoli-avenir-patrimoine-1025.html",
    "Epargne Pierre": "https://www.ideal-investisseur.fr/scpi-avis/epargne-pierre-10014.html",
    "Epargne Pierre Europe": "https://www.ideal-investisseur.fr/scpi-avis/epargne-pierre-europe-10143.html",
    "Epargne Pierre Sophia": "https://www.ideal-investisseur.fr/scpi-avis/epargne-pierre-sophia-10144.html",
    "Accès Valeur Pierre": "https://www.ideal-investisseur.fr/scpi-avis/scpi-acces-valeur-pierre-1029.html",
    "Accimmo Pierre": "https://www.ideal-investisseur.fr/scpi-avis/accimmo-pierre-1050.html",
    "Opus Real": "https://www.ideal-investisseur.fr/scpi-avis/opus-real-bnp-reim-1119.html",
    "Edmond de Rothschild Europa": "https://www.ideal-investisseur.fr/scpi-avis/edr-europa-10133.html",
    "Atream Hôtels": "https://www.ideal-investisseur.fr/scpi-avis/scpi-atream-hotels-1032.html",
    "Buroboutic Métropoles": "https://www.ideal-investisseur.fr/scpi-avis/buroboutic-metropoles-10116.html",
    "Ficommerce Proximité": "https://www.ideal-investisseur.fr/scpi-avis/ficommerce-proximite-10115.html",
    "Logipierre 3 Résidences Services": "https://www.ideal-investisseur.fr/scpi-avis/logipierre-residences-services-10118.html",
    "Pierre Expansion Santé": "https://www.ideal-investisseur.fr/scpi-avis/pierre-expansion-sante-10117.html",
    "Selectipierre 2 - Paris": "https://www.ideal-investisseur.fr/scpi-avis/selectipierre-2-1072.html",
    "Affinités Pierre": "https://www.ideal-investisseur.fr/scpi-avis/scpi-affinites-pierre-1031.html",
    "Attraits Pierre": "https://www.ideal-investisseur.fr/scpi-avis/attraits-pierre-10141.html",
    "Elysées Grand Large": "https://www.ideal-investisseur.fr/scpi-avis/elysees-grand-large-10103.html",
    "Elysées Pierre": "https://www.ideal-investisseur.fr/scpi-avis/elysees-pierre-1041.html",
    "Kyaneos Pierre": "https://www.ideal-investisseur.fr/scpi-avis/kyaneos-denormandie-3-10032.html",
    "Crédit Mutuel Pierre 1": "https://www.ideal-investisseur.fr/scpi-avis/credit-mutuel-pierre-1-1065.html",
    "Epargne Foncière": "https://www.ideal-investisseur.fr/scpi-avis/scpi-epargne-fonciere-1001.html",
    "LF Avenir Santé": "https://www.ideal-investisseur.fr/scpi-avis/lf-avenir-sante-1062.html",
    "LF Croissance et Territoires": "https://www.ideal-investisseur.fr/scpi-avis/lf-croissance-et-territoires-10152.html",
    "LF Europimmo": "https://www.ideal-investisseur.fr/scpi-avis/lf-europimmo-1129.html",
    "LF Grand Paris Patrimoine": "https://www.ideal-investisseur.fr/scpi-avis/lf-grand-paris-patrimoine-1114.html",
    "LF Opportunité Immo": "https://www.ideal-investisseur.fr/scpi-avis/lf-opportunite-immo-1115.html",
    "Selectinvest 1": "https://www.ideal-investisseur.fr/scpi-avis/selectinvest-1-1126.html",
    "Foncière Des Praticiens": "https://www.ideal-investisseur.fr/scpi-avis/fonciere-des-praticiens-1144.html",
    "Osmo Energie": "https://www.ideal-investisseur.fr/scpi-avis/osmo-energie-1147.html",
    "My Share Education": "https://www.ideal-investisseur.fr/scpi-avis/my-share-education-10124.html",
    "My Share SCPI": "https://www.ideal-investisseur.fr/scpi-avis/my-share-scpi-1145.html",
    "Novaxia Neo": "https://www.ideal-investisseur.fr/scpi-avis/novaxia-neo-1099.html",
    "Novapierre 1": "https://www.ideal-investisseur.fr/scpi-avis/novapierre-1075.html",
    "Paref Evo": "https://www.ideal-investisseur.fr/scpi-avis/paref-evo-10162.html",
    "Paref Hexa": "https://www.ideal-investisseur.fr/scpi-avis/paref-hexa-10164.html",
    "Paref Prima": "https://www.ideal-investisseur.fr/scpi-avis/paref-prima-10163.html",
    "Perial Grand Paris": "https://www.ideal-investisseur.fr/scpi-avis/pf-grand-paris-1067.html",
    "Perial Hospitalité Europe": "https://www.ideal-investisseur.fr/scpi-avis/pf-hospitalite-europe-1070.html",
    "Perial O2": "https://www.ideal-investisseur.fr/scpi-avis/pf-o2-1068.html",
    "Perial Opportunités Europe": "https://www.ideal-investisseur.fr/scpi-avis/perial-opportunites-europe-1066.html",
    "Perial Opportunités Territoires": "https://www.ideal-investisseur.fr/scpi-avis/perial-opportunites-territoires-1065.html",
    "Patrimmo Commerce": "https://www.ideal-investisseur.fr/scpi-avis/patrimmo-commerce-10110.html",
    "Praemia Hotels Europe": "https://www.ideal-investisseur.fr/scpi-avis/praemia-hotels-europe-1131.html",
    "Primopierre": "https://www.ideal-investisseur.fr/scpi-avis/primopierre-1081.html",
    "Primovie": "https://www.ideal-investisseur.fr/scpi-avis/scpi-primovie-primonial-1024.html",
    "Principal Inside": "https://www.ideal-investisseur.fr/scpi-avis/principal-inside-1132.html",
    "Remake Live": "https://www.ideal-investisseur.fr/scpi-avis/remake-live-10017.html",
    "Remake UK 2025": "https://www.ideal-investisseur.fr/scpi-avis/remake-uk-1127.html",
    "Efimmo 1": "https://www.ideal-investisseur.fr/scpi-avis/efimmo-1-1063.html",
    "Immorente": "https://www.ideal-investisseur.fr/scpi-avis/scpi-immorente-1026.html",
    "Sofiboutique": "https://www.ideal-investisseur.fr/scpi-avis/sofiboutique-10023.html",
    "Sofidy Europe Invest": "https://www.ideal-investisseur.fr/scpi-avis/sofidy-europe-invest-10129.html",
    "Sofidynamic": "https://www.ideal-investisseur.fr/scpi-avis/sofidynamic-10128.html",
    "Sofipierre": "https://www.ideal-investisseur.fr/scpi-avis/sofipierre-10020.html",
    "Coeur d'Avenir": "https://www.ideal-investisseur.fr/scpi-avis/coeur-d-avenir-1102.html",
    "Coeur d'Europe": "https://www.ideal-investisseur.fr/scpi-avis/coeur-d-europe-sogenial-10042.html",
    "Coeur de régions": "https://www.ideal-investisseur.fr/scpi-avis/sogenial-coeur-de-regions-1103.html",
    "Coeur de ville": "https://www.ideal-investisseur.fr/scpi-avis/coeur-de-ville-10044.html",
    "Esprit Horizon": "https://www.ideal-investisseur.fr/scpi-avis/esprit-horizon-10136.html",
    "ESG Pierre Capitale": "https://www.ideal-investisseur.fr/scpi-avis/esg-pierre-capitale-1134.html",
    "Mistral Sélection": "https://www.ideal-investisseur.fr/scpi-avis/mistral-selection-10130.html",
    "Telamon Borea": "https://www.ideal-investisseur.fr/scpi-avis/borea-1106.html",
    "LOG IN": "https://www.ideal-investisseur.fr/scpi-avis/log-in-theroreim-10036.html",
    "Urban Coeur Commerce": "https://www.ideal-investisseur.fr/scpi-avis/urban-vitalim-3-1141.html",
    "Wemo One": "https://www.ideal-investisseur.fr/scpi-avis/wemo-one-10131.html",
    "New Gen": "https://www.ideal-investisseur.fr/scpi-avis/newgen-immo-10021.html",
    "Transitions Europe": "https://www.ideal-investisseur.fr/scpi-avis/transitions-europe-10040.html",
    "Altixia Commerces": "https://www.ideal-investisseur.fr/scpi-avis/altixia-commerces-1006.html",
    "Allianz Pierre": "https://www.ideal-investisseur.fr/scpi-avis/allianz-pierre-1030.html",
    "Comète": "https://www.ideal-investisseur.fr/scpi-avis/comete-alderan-10045.html",
    "Upeka": "https://www.ideal-investisseur.fr/scpi-avis/upeka-10147.html",
    "MomenTime": "https://www.ideal-investisseur.fr/scpi-avis/MomenTime-10149.html",
    "Eden": "https://www.ideal-investisseur.fr/scpi-avis/eden-10150.html",
    "Elialys": "https://www.ideal-investisseur.fr/scpi-avis/elialys-1003.html",
    "Eurovalys": "https://www.ideal-investisseur.fr/scpi-avis/eurovalys-1002.html",
    "Linaclub": "https://www.ideal-investisseur.fr/scpi-avis/linaclub-1004.html",
}

# Merge: manual entries win over sitemap-derived ones on key collisions.
url_map.update(manual_map)

def read_sheet():
    """Fetch rows A1:Y111 of the 'Listing sites' tab via the `gws` CLI.

    Returns:
        The sheet's 'values' list-of-rows ([] when the range is empty).

    Raises:
        RuntimeError: when the CLI exits non-zero or emits invalid JSON
            (the original crashed with an opaque JSONDecodeError instead).
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0:
        raise RuntimeError(f"gws read failed (rc={r.returncode}): {r.stderr.strip()}")
    try:
        return json.loads(r.stdout).get('values', [])
    except json.JSONDecodeError as e:
        # Surface what the CLI actually printed instead of a bare parse error.
        raise RuntimeError(f"gws returned non-JSON output: {r.stdout[:200]!r}") from e

def gws_update(range_str, value):
    """Write a single cell on the 'Listing sites' tab via the `gws` CLI.

    range_str is an A1-style cell reference (e.g. 'G12'); value is written
    with USER_ENTERED semantics. Returns True when the CLI output reports
    updated cells, False otherwise.
    """
    full_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": full_range, "valueInputOption": "USER_ENTERED"}
    payload = {"range": full_range, "values": [[value]]}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps(params),
           '--json', json.dumps(payload)]
    proc = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in proc.stdout

def html_to_text(body):
    """Flatten an HTML page into plain text suitable for regex extraction.

    body may be bytes (decoded as UTF-8, errors ignored) or str. Scripts and
    styles are dropped, block-level tags become newlines, remaining tags
    become spaces, entities are decoded, and whitespace is collapsed.
    """
    from html import unescape  # stdlib; local import avoids clashing with the `html` variable below
    html = body.decode('utf-8', errors='ignore') if isinstance(body, bytes) else body
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    # Decode ALL entities via the stdlib instead of a hand-maintained table
    # (the original missed &lt;, &quot;, most numeric references, etc.).
    text = unescape(text)
    # &nbsp; decodes to U+00A0; normalize it so the collapse below catches it.
    text = text.replace('\xa0', ' ')
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text

def extract_ii(text):
    """Extract SCPI data points from an ideal-investisseur.fr review page.

    text is the plain-text rendering of the page (see html_to_text). Returns
    a dict keyed by spreadsheet COLUMN LETTER (D..Y) with formatted string
    values; keys are present only when the corresponding pattern matched.
    Patterns target French-language labels as they appear on the site.
    """
    d = {}
    
    # G: ISR label — site prints "Label ISR Oui/Non"
    m = re.search(r'Label\s+ISR\s*(Non|Oui)', text, re.I)
    if m: d['G'] = 'Oui' if m.group(1).lower() not in ('non', 'no') else 'Non'
    
    # D: Capital type (fixe/variable), capitalized for the sheet
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()
    
    # K: Endettement (debt ratio), percent
    m = re.search(r"Taux\s+d'endettement\s+(\d+[,.]?\d*)\s*%", text, re.I)
    if m: d['K'] = m.group(1) + '%'
    
    # S: TOF (occupancy rate), percent
    m = re.search(r'\bTOF\s+(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['S'] = m.group(1) + '%'
    
    # V: Prix souscription (share subscription price, €)
    m = re.search(r'Prix\s+de\s+souscription\s+(\d[\d\s,.]*€)', text, re.I)
    if m: d['V'] = m.group(1).strip()
    
    # W: Valeur reconstitution (reconstitution value, €)
    m = re.search(r'Valeur\s+de\s+reconstitution\s+(\d[\d\s,.]*€)', text, re.I)
    if m: d['W'] = m.group(1).strip()
    
    # H: Souscription min (minimum investment, €)
    m = re.search(r'(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['H'] = m.group(1).strip()
    
    # Q: Jouissance (entitlement delay in months)
    m = re.search(r'(?:Délai|Durée)\s+de\s+jouissance\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()
    
    # L: Parts (share count; thousands separators stripped)
    m = re.search(r'(?:Nombre\s+de\s+parts|Parts?\s+émises?)\s*:?\s*(\d[\d\s]*)', text, re.I)
    if m: d['L'] = m.group(1).strip().replace(' ', '')
    
    # M: Parts retrait (shares pending withdrawal)
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*(?:attente\s*de\s*)?retrait', text, re.I)
    if m: d['M'] = m.group(1).strip().replace(' ', '')
    
    # N: RAN (retained earnings) — prefer "N jours de distribution", else €
    m = re.search(r'Report\s+à\s+nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)', text, re.I)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'Report\s+à\s+nouveau\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
        if m: d['N'] = m.group(1).strip()
    
    # O: PGE (provision for major maintenance, €)
    m = re.search(r'(?:Provision|Réserve)\s+pour\s+gros\s+entretien\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
    if m: d['O'] = m.group(1).strip()
    
    # P: Fiscalite — German corporate-tax regime first, else French default
    if re.search(r"impôt.*?sociétés.*?allemand|IS\s*15,825", text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r"revenus?\s+fonciers|PFU|prélèvement\s+forfaitaire", text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'
    
    # R: Freq — distribution frequency keyword anywhere in the page
    if re.search(r'trimestriel', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'semestriel', text, re.I): d['R'] = 'Semestrielle'
    
    # T: TD/PGA (distribution rate, %); tolerates a year and unicode minus/dash
    m = re.search(r'Taux\s+de\s+distribution\s*(?:\d{4})?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['T'] = m.group(1) + '%'
    
    # U: TRI (10-year internal rate of return, %)
    m = re.search(r'TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['U'] = m.group(1) + '%'
    
    # X: Commission SS (subscription fee, % — assumed TTC; TODO confirm on site)
    m = re.search(r'(?:Commission|Frais)\s+de\s+souscription\s+(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['X'] = m.group(1) + '% TTC'
    
    # Y: Commission gestion (management fee, % — assumed TTC; TODO confirm on site)
    m = re.search(r'(?:Commission|Frais)\s+de\s+gestion\s+(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['Y'] = m.group(1) + '% TTC'
    
    # E: Geo — up to 6 "Place NN%" pairs from the geographic breakdown block
    m = re.search(r'Répartition\s*géographique\s*\n((?:[^\n]+\n){2,15})', text, re.I)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])
    
    # F: Secteur — up to 6 "Sector NN%" pairs from the asset-typology block
    m = re.search(r'(?:Typologie|Répartition)\s*(?:des actifs|sectorielle)\s*\n((?:[^\n]+\n){2,15})', text, re.I)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])
    
    # I: Date — creation month + year (e.g. "janvier 2019")
    m = re.search(r'Création\s*:?\s*(\w+\s+\d{4})', text, re.I)
    if m: d['I'] = m.group(1).strip()
    
    # J: Cap — capitalization with its unit (M€ / Mds€ / milliards)
    m = re.search(r'Capitalisation\s+(\d+[,.]?\d*)\s*(M€|Mds?€|milliards?\s*€?)', text, re.I)
    if m: d['J'] = m.group(1) + ' ' + m.group(2)
    
    return d

# Read sheet and find which cells still need filling.
# needs maps 1-based sheet row -> (scpi name, {column letter -> column index})
# for every row whose columns D..Y contain an empty/placeholder value.
rows = read_sheet()
needs = {}
for r, row in enumerate(rows[1:], start=2):  # skip header; r is the sheet row number
    name = row[1].strip() if len(row) > 1 else ''
    if not name:
        continue
    missing = {}
    for col_idx in range(3, 25):  # columns D (3) through Y (24)
        cell = row[col_idx].strip() if col_idx < len(row) else ''
        if cell in ('', 'N/A', 'Non trouvé', '-'):
            missing[chr(65 + col_idx)] = col_idx
    if missing:
        needs[r] = (name, missing)

print(f"Rows needing fixes: {len(needs)}")
print(f"URLs available in map: {len(url_map)}")

# Check how many match
matched = sum(scpi in url_map for scpi, _ in needs.values())
print(f"Matched URLs: {matched}/{len(needs)}")

# Main pass: fetch each matched page, extract values, write only the cells
# the sheet is missing. no_data collects skipped SCPIs with a reason tag.
fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0
no_data = []

for r, (scpi, empty) in sorted(needs.items()):  # ascending row order for readable progress output
    if scpi not in url_map:
        no_data.append(scpi)
        continue
    
    url = url_map[scpi]
    try:
        page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        # Reject error statuses and suspiciously small bodies (stub/block pages).
        if page.status != 200 or not page.body or len(page.body) < 1000:
            no_data.append(f"{scpi} (HTTP {page.status})")
            continue
        text = html_to_text(page.body)
    except Exception as e:
        # Keep the actual error so failures are diagnosable from the summary
        # (the original discarded `e` and logged only "(error)").
        no_data.append(f"{scpi} (error: {e})")
        continue
    
    data = extract_ii(text)
    # Only write columns that are both missing in the sheet AND extracted.
    updates = [(col, data[col]) for col in empty if col in data]
    if not updates:
        no_data.append(f"{scpi} (no match)")
        continue
    
    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # throttle per-cell writes to stay under API quota
        else:
            print(f"{col}❌ ", end="", flush=True)
    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)  # brief pause between pages to be polite to the site

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")
print(f"\nNo URL/data ({len(no_data)}): {', '.join(no_data[:15])}")
