#!/usr/bin/env python3
"""
SCPI Batch Filler — Master script for scpi-scraper agent.
Uses ideal-investisseur JSON extraction + commentbieninvestir as fallback.
Processes ALL rows with gaps.

KEY DISCOVERY: ideal-investisseur pages contain structured JSON in 
<script type="application/json"> blocks with ALL financial data.
"""
import json, re, subprocess, time, sys, os

# Google Sheet that holds the SCPI listing (data lives in the "Listing sites" tab).
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

# === STEP 1: Read current sheet ===
def read_sheet():
    """Fetch rows A1:Y111 of the "Listing sites" tab via the `gws` CLI.

    Returns:
        list[list[str]]: the sheet rows (possibly ragged — short rows are
        not padded); [] when the response has no 'values' key.

    Raises:
        RuntimeError: if `gws` exits non-zero, with its stderr attached.
            (Previously a CLI failure surfaced as an opaque JSONDecodeError
            on empty stdout.)
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0:
        raise RuntimeError(f"gws sheets get failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def gws_update(range_str, value):
    """Write a single value to Listing sites!<range_str> via the `gws` CLI.

    Returns True when the CLI response mentions 'updatedCells' (write
    acknowledged by the Sheets API), False otherwise.
    """
    target = f"Listing sites!{range_str}"
    params = {
        "spreadsheetId": SHEET_ID,
        "range": target,
        "valueInputOption": "USER_ENTERED",
    }
    payload = {"range": target, "values": [[value]]}
    argv = [
        'gws', 'sheets', 'spreadsheets', 'values', 'update',
        '--params', json.dumps(params),
        '--json', json.dumps(payload),
    ]
    proc = subprocess.run(argv, capture_output=True, text=True,
                          cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in proc.stdout

# === STEP 2: Build sitemap-based URL map ===
print("Building URL map from sitemap...")
from urllib.request import urlopen, Request
try:
    # Browser-like UA: the site may reject urllib's default User-Agent.
    sitemap_req = Request("https://www.ideal-investisseur.fr/sitemap.xml",
                          headers={'User-Agent': 'Mozilla/5.0'})
    sitemap_body = urlopen(sitemap_req, timeout=15).read().decode('utf-8')
    # Only the scpi-avis pages are of interest here.
    sitemap_urls = re.findall(
        r'<loc>(https://www\.ideal-investisseur\.fr/scpi-avis/[^<]+)</loc>',
        sitemap_body)
    print(f"  Found {len(sitemap_urls)} URLs in sitemap")
except Exception as e:
    # Best-effort: the manual map below is the real source of URLs.
    sitemap_urls = []
    print(f"  Sitemap error: {e}")

# Manual URL map (verified): SCPI display name -> ideal-investisseur numeric
# page id. Combined below with a slugified name to form the scpi-avis URL.
MANUAL_URLS = {
    "Eden": 10150, "Elialys": 1003, "Eurovalys": 1002, "Aestiam Agora": 1028,
    "Aestiam Horizon": 1071, "Linaclub": 1004, "AEW Commerces Europe": 10058,
    "AEW Diversification Allemagne": 10025, "AEW Opportunités Europe": 10026,
    "AEW Patrimoine Santé": 10027, "Atout Pierre Diversification": 1146,
    "Activimmo": 10009, "Comète": 10045, "Allianz Pierre": 1030,
    "Alta Convictions": 10153, "Altixia Cadence XII": 1004,
    "Altixia Commerces": 1006, "Edissimmo": 1054, "Genepierre": 1077,
    "Rivoli Avenir Patrimoine": 1025, "MomenTime": 10149,
    "Transitions Europe": 10040, "New Gen": 10021,
    "Epargne Pierre": 10014, "Epargne Pierre Europe": 10143,
    "Epargne Pierre Sophia": 10144, "Atream Hôtels": 1032,
    "Upeka": 10147, "Accès Valeur Pierre": 1029, "Accimmo Pierre": 1050,
    "Imarea Pierre": 10111, "Opus Real": 1119, "Optimale": 10005,
    "Corum Eurion": 10000, "Corum Origin": 1000, "Corum USA": 1001,
    "Corum XL": 1097, "Darwin RE01": 10138,
    "Edmond de Rothschild Europa": 10133, "Elevation Tertiom": 10137,
    "Epsicap Explore": 10019, "Epsicap Nano": 10034,
    "Euryale Horizons Santé": 10035, "Pierval Santé": 1055,
    "Buroboutic Métropoles": 10116, "Ficommerce Proximité": 10115,
    "Logipierre 3 Résidences Services": 10118,
    "Pierre Expansion Santé": 10117, "Selectipierre 2 - Paris": 1072,
    "Cap Foncières et Territoires": 1045, "GMA Essentialis": 10008,
    "Affinités Pierre": 1031, "Attraits Pierre": 10141,
    "Elysées Grand Large": 10103, "Elysées Pierre": 1041,
    "Cristal Life": 10033, "Cristal Rente": 10033,
    "Iroko Atlas": 10006, "Iroko Zen": 10007,
    "Kyaneos Pierre": 10032, "Crédit Mutuel Pierre 1": 1065,
    "Epargne Foncière": 1001, "LF Avenir Santé": 1062,
    "LF Croissance et Territoires": 10152, "LF Europimmo": 1129,
    "LF Grand Paris Patrimoine": 1114, "LF Opportunité Immo": 1115,
    "Selectinvest 1": 1126, "Foncière Des Praticiens": 1144,
    "Osmo Energie": 1147, "Reason": 10039,
    "My Share Education": 10124, "My Share SCPI": 1145,
    "NCap Continent": 10122, "NCap Education Santé": 10123,
    "NCap Régions": 1140, "Novaxia Neo": 1099, "Novapierre 1": 1075,
    "Paref Evo": 10162, "Paref Hexa": 10164, "Paref Prima": 10163,
    "Perial Grand Paris": 1067, "Perial Hospitalité Europe": 1070,
    "Perial O2": 1068, "Perial Opportunités Europe": 1066,
    "Perial Opportunités Territoires": 1065,
    "Patrimmo Commerce": 10110, "Praemia Hotels Europe": 1131,
    "Primopierre": 1081, "Primovie": 1024,
    "Principal Inside": 1132, "Remake Live": 10017,
    "Remake UK 2025": 1127, "Efimmo 1": 1063,
    "Immorente": 1026, "Sofiboutique": 10023,
    "Sofidy Europe Invest": 10129, "Sofidynamic": 10128,
    "Sofipierre": 10020, "Coeur d'Avenir": 1102,
    "Coeur d'Europe": 10042, "Coeur de régions": 1103,
    "Coeur de ville": 10044, "Esprit Horizon": 10136,
    "ESG Pierre Capitale": 1134, "Mistral Sélection": 10130,
    "Telamon Borea": 1106, "LOG IN": 10036,
    "Urban Coeur Commerce": 1141, "Wemo One": 10131,
}

# Build name -> URL map: slugify each SCPI name (fold accents, drop
# quotes/parens, dash the spaces) and append the page id.
# One-pass accent folding via str.translate replaces the previous
# per-character full-string replace() chain (same mapping: éèêë->e,
# âà->a, îï->i, ô->o, ûü->u, ç->c).
_ACCENT_MAP = str.maketrans("éèêëâàîïôûüç", "eeeeaaiiouuc")

def _slugify(name):
    """Return the URL slug for a SCPI display name."""
    slug = name.lower().translate(_ACCENT_MAP)
    return slug.replace("'", '').replace(' ', '-').replace('(', '').replace(')', '')

URL_MAP = {
    name: f"https://www.ideal-investisseur.fr/scpi-avis/{_slugify(name)}-{iid}.html"
    for name, iid in MANUAL_URLS.items()
}
# === STEP 3: Fetch + JSON extraction ===
# Reuse the scrapling install from the web-scraper skill's venv — it is not
# on this interpreter's default sys.path.
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

def fetch_json(url, fetcher):
    """Fetch `url` with `fetcher` and return its embedded data dict.

    Prefers the structured <script type="application/json"> block that
    ideal-investisseur pages embed (recognised by a 'latest' key); when no
    such block parses, falls back to plain-text extraction via
    extract_from_html. Returns None on HTTP errors, stub pages (< 500
    bytes) or any fetch/parse failure.
    """
    try:
        page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        # Reject errors and stub pages — anything under 500 bytes is not a
        # real listing page.
        if page.status != 200 or not page.body or len(page.body) < 500:
            return None
        html = page.body.decode('utf-8', errors='ignore')
        blocks = re.findall(r'<script[^>]*type="application/json"[^>]*>(.*?)</script>', html, re.S)
        for block in blocks:
            try:
                data = json.loads(block)
                if 'latest' in data:
                    return data
            except ValueError:  # invalid JSON in this block — try the next one
                pass

        # No JSON block — fall back to HTML text extraction
        return extract_from_html(html)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as "no data".
        return None

def extract_from_html(html):
    """Fallback extractor: pull SCPI metrics out of raw page HTML.

    Strips scripts/styles and markup down to plain text, then runs a
    battery of French-label regexes over it. Returns a dict tagged
    '_source': 'html', or None when fewer than two metrics were found.
    """
    # --- HTML -> plain text -------------------------------------------------
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    for entity, repl in (('&nbsp;', ' '), ('&amp;', '&'), ('&euro;', '€'), ('&rsquo;', "'")):
        text = text.replace(entity, repl)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)

    # --- labelled-value extraction -----------------------------------------
    def euros(m):   # "1 250 €" -> "1250"
        return m.group(1).strip().replace(' €', '').replace(' ', '')

    def digits(m):  # drop internal thousands spaces from a digit group
        return m.group(1).strip().replace(' ', '')

    # (key, pattern, transform); transform=None keeps group(1) verbatim.
    rules = [
        ('capital_type', r'Capital\s+(fixe|variable)', lambda m: m.group(1).capitalize()),
        ('taux_endettement', r"Taux\s+d'endettement\s+(\d+[,.]?\d*)\s*%", None),
        ('tof', r"\bTOF\s+(\d+[,.]?\d*)\s*%", None),
        ('prix_souscription', r'Prix\s+de\s+souscription\s+(\d[\d\s,.]*€)', euros),
        ('valeur_reconstitution', r'Valeur\s+de\s+reconstitution\s+(\d[\d\s,.]*€)', euros),
        ('taux_distribution', r"Taux\s+de\s+distribution\s*(?:\d{4})?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%", None),
        ('tri_10_ans', r"TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%", None),
        ('capitalisation', r'Capitalisation\s+(\d+[,.]?\d*)\s*(M€|Mds?€)', lambda m: m.group(1) + ' ' + m.group(2)),
        ('parts_en_attente_retrait', r"(\d[\d\s]*)\s*parts?\s*en\s*(?:attente\s*de\s*)?retrait", digits),
        ('comm_souscription', r"(?:Commission|Frais)\s+de\s+souscription\s+(\d+[,.]?\d*)\s*%", None),
        ('comm_gestion', r"(?:Commission|Frais)\s+de\s+gestion\s+(\d+[,.]?\d*)\s*%", None),
        ('delai_jouissance', r"(?:Délai|Durée)\s+de\s+jouissance\s*:?\s*(\d+)\s*mois", lambda m: m.group(1) + ' mois'),
        ('souscription_min', r"(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)", lambda m: m.group(1).strip()),
        ('nb_parts', r"(?:Nombre\s+de\s+parts|Parts?\s+émises?)\s*:?\s*(\d[\d\s]*)", digits),
    ]

    d = {'_source': 'html'}
    for key, pattern, transform in rules:
        m = re.search(pattern, text, re.I)
        if m:
            d[key] = transform(m) if transform else m.group(1)

    # Label ISR: an explicit "Non"/"No" after the label wins over a mere mention.
    if re.search(r'Label\s+ISR\s*(Non|No)', text, re.I):
        d['label_isr'] = False
    elif re.search(r'Label\s+ISR', text, re.I):
        d['label_isr'] = True

    # Require at least two extracted metrics besides the '_source' tag.
    return d if len(d) > 2 else None

def map_to_cols(data):
    """Map extracted data to sheet column letters.

    Accepts either the structured JSON dict (with a 'latest' sub-dict,
    plus an optional 'scpi' sub-dict) or the flat dict produced by
    extract_from_html (tagged '_source'). Returns {column_letter: value},
    skipping 0/null placeholders.

    Columns: D=capital type, G=label ISR, H=min subscription,
    I=creation date, J=capitalisation, K=debt ratio, L=share count,
    M=shares pending redemption, P=tax regime, Q=jouissance delay,
    S=TOF, T=distribution rate, U=10y IRR, V=subscription price,
    W=reconstitution value, X=subscription fee, Y=management fee.
    """
    d = {}
    latest = data.get('latest', {})
    is_json = '_source' not in data

    # String forms meaning "ISR label absent". BUG FIX: include
    # 'false'/'no' — a JSON boolean False stringifies to 'false', which
    # the old tuple ('null', '', 'non', '0') missed, so SCPIs without the
    # label were written as 'Oui'.
    isr_falsy = ('null', '', 'non', '0', 'false', 'no')

    if is_json:
        # From JSON — each field guarded against 0/null placeholders.
        v = latest.get('taux_endettement')
        if v and str(v) not in ('0', '0.00', 'null'): d['K'] = str(v) + '%'

        v = latest.get('tof')
        if v and str(v) not in ('0', '0.00', 'null'): d['S'] = str(v) + '%'

        v = latest.get('prix_souscription')
        if v and str(v) != '0': d['V'] = str(v) + ' €'

        v = latest.get('valeur_reconstitution')
        if v and str(v) not in ('0', '0.00', 'null'): d['W'] = str(v) + ' €'

        v = latest.get('taux_distribution')
        if v and str(v) not in ('0', '0.00', 'null'): d['T'] = str(v) + '%'

        v = latest.get('tri_10_ans')
        if v and str(v) not in ('0', '0.00', 'null'): d['U'] = str(v) + '%'

        v = latest.get('label_isr')
        if v is not None: d['G'] = 'Oui' if str(v).lower() not in isr_falsy else 'Non'

        v = latest.get('capitalisation')
        if v: d['J'] = str(v)

        v = latest.get('parts_en_attente_retrait')
        if v and str(v) not in ('0', 'null', ''): d['M'] = str(v)

        v = latest.get('nb_parts') or latest.get('nombre_parts')
        if v and str(v) not in ('0', 'null', ''): d['L'] = str(v)

        v = latest.get('comm_souscription')
        if v: d['X'] = str(v) + '% TTC'

        v = latest.get('comm_gestion')
        if v: d['Y'] = str(v) + '% TTC'

        v = latest.get('delai_jouissance')
        if v: d['Q'] = str(v)

        v = latest.get('souscription_min')
        if v: d['H'] = str(v)

        v = latest.get('capital_type')
        if v: d['D'] = str(v)

        v = latest.get('fiscalite')
        if v: d['P'] = str(v)

        # From scpi data
        scpi = data.get('scpi', {})
        v = scpi.get('date_creation')
        if v: d['I'] = str(v)

    else:
        # From HTML fallback — values are already formatted strings; only
        # the unit suffix per column group differs.
        for key, col in [
            ('taux_endettement', 'K'), ('tof', 'S'), ('taux_distribution', 'T'),
            ('tri_10_ans', 'U'), ('capitalisation', 'J'), ('comm_souscription', 'X'),
            ('comm_gestion', 'Y'), ('delai_jouissance', 'Q'), ('souscription_min', 'H'),
            ('nb_parts', 'L'), ('capital_type', 'D'),
        ]:
            v = data.get(key)
            if v:
                if col in ('K','S','T','U'): d[col] = str(v) + '%'
                elif col in ('X','Y'): d[col] = str(v) + '% TTC'
                else: d[col] = str(v)

        v = data.get('label_isr')
        if v is not None: d['G'] = 'Oui' if v else 'Non'

        v = data.get('parts_en_attente_retrait')
        if v and str(v) != '0': d['M'] = str(v)

        v = data.get('prix_souscription')
        if v: d['V'] = str(v) + ' €'

        v = data.get('valeur_reconstitution')
        if v and str(v) != '0': d['W'] = str(v) + ' €'

    return d

# === MAIN ===
rows = read_sheet()

# Scan every data row (skipping the header) for blank/placeholder cells in
# columns D..Y (indices 3..24); record them as
# {sheet_row_number: (scpi_name, {column_letter: column_index})}.
PLACEHOLDERS = ('', 'N/A', 'Non trouvé', '-')
needs = {}
for r, row in enumerate(rows[1:], start=2):  # r = 1-based sheet row number
    scpi = row[1].strip() if len(row) > 1 else ''
    if not scpi:
        continue
    empty = {}
    for j in range(3, 25):
        cell = row[j].strip() if j < len(row) else ''
        if cell in PLACEHOLDERS:
            empty[chr(65 + j)] = j
    if empty:
        needs[r] = (scpi, empty)

print(f"\nRows needing fixes: {len(needs)}")
print(f"URLs in map: {len(URL_MAP)}")

fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0
results = {}

for r, (scpi, empty) in sorted(needs.items()):
    # Skip rows with no known source URL, or whose page yields nothing.
    url = URL_MAP.get(scpi)
    if url is None:
        results[scpi] = 'no_url'
        continue

    data = fetch_json(url, fetcher)
    if not data:
        results[scpi] = 'no_data'
        continue

    # Only write into cells that are both empty on the sheet and present
    # in the scraped data.
    mapped = map_to_cols(data)
    pending = [(c, mapped[c]) for c in empty if c in mapped]
    if not pending:
        results[scpi] = 'no_match'
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    filled = 0
    for col, value in pending:
        ok = gws_update(f"{col}{r}", value)
        print(f"{col}{'✅' if ok else '❌'} ", end="", flush=True)
        if ok:
            filled += 1
            time.sleep(0.12)  # throttle successive cell writes
    print(f"({filled})")
    total_rows += 1
    total_cells += filled
    results[scpi] = f'{filled} cells'
    time.sleep(0.25)  # per-row throttle

# Summary
print(f"\n\n{'='*60}")
print(f"🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")

# Report skipped SCPIs grouped by failure reason (first 10 names each).
print()
for label, tag in (('No URL', 'no_url'), ('No data', 'no_data'), ('No match', 'no_match')):
    names = [scpi for scpi, status in results.items() if status == tag]
    print(f"{label} ({len(names)}): {', '.join(names[:10])}")

# Save results for agent
with open('/tmp/scpi_batch_results.json', 'w') as fh:
    json.dump(results, fh, ensure_ascii=False, indent=2)
