#!/usr/bin/env python3
"""
SCPI Batch Filler v2 — Better extraction from ideal-investisseur JSON.
Focuses on fillable columns: K, L, M, H, P, Q, X, Y, D, U, T, S, V, W, G, J
"""
import json, re, subprocess, time, sys, os

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

def read_sheet():
    """Read the 'Listing sites' tab (A1:Y111) via the gws CLI.

    Returns:
        list[list[str]]: the range's ``values`` matrix (rows may be ragged,
        trailing empty cells are omitted by the API); ``[]`` if the range
        carries no values.

    Raises:
        RuntimeError: if the gws CLI exits non-zero or emits non-JSON output,
            with a truncated stderr/stdout excerpt for diagnosis.
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0:
        raise RuntimeError(f"gws get failed (rc={r.returncode}): {r.stderr.strip()[:200]}")
    try:
        return json.loads(r.stdout).get('values', [])
    except json.JSONDecodeError as e:
        raise RuntimeError(f"gws returned non-JSON output: {r.stdout[:200]!r}") from e

# NOTE: the file previously defined gws_update twice — the first copy had a
# 'spreadsettes' typo and was dead code, immediately shadowed by the second.
# Keep the single correct definition.
def gws_update(range_str, value):
    """Write *value* into one cell of the 'Listing sites' tab.

    Args:
        range_str: A1-style cell reference without the sheet name (e.g. "K12").
        value: cell content; USER_ENTERED lets Sheets parse "6,2%" / "200€"
            as numbers rather than storing them as raw strings.

    Returns:
        bool: True when the gws CLI output mentions 'updatedCells',
        i.e. the write was acknowledged.
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": f"Listing sites!{range_str}", "valueInputOption": "USER_ENTERED"}),
           '--json', json.dumps({"range": f"Listing sites!{range_str}", "values": [[value]]})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in r.stdout

# URL map: SCPI display name (exactly as it appears in column B of the sheet)
# -> numeric article id used in ideal-investisseur.fr detail-page URLs.
# The slug part of the URL is derived from the name by the loop below.
# NOTE(review): a few ids appear under two names ("Cristal Life"/"Cristal Rente"
# both 10033, "Linaclub"/"Altixia Cadence XII" both 1004, "Corum USA"/"Epargne
# Foncière" both 1001, "Perial Opportunités Territoires"/"Crédit Mutuel Pierre 1"
# both 1065) — presumably copy/paste slips; verify against the live site.
MANUAL_URLS = {
    "Eden": 10150, "Elialys": 1003, "Eurovalys": 1002, "Aestiam Agora": 1028,
    "Aestiam Horizon": 1071, "Linaclub": 1004, "AEW Commerces Europe": 10058,
    "AEW Diversification Allemagne": 10025, "AEW Opportunités Europe": 10026,
    "AEW Patrimoine Santé": 10027, "Atout Pierre Diversification": 1146,
    "Activimmo": 10009, "Comète": 10045, "Allianz Pierre": 1030,
    "Alta Convictions": 10153, "Altixia Cadence XII": 1004,
    "Altixia Commerces": 1006, "Edissimmo": 1054, "Genepierre": 1077,
    "Rivoli Avenir Patrimoine": 1025, "MomenTime": 10149,
    "Transitions Europe": 10040, "New Gen": 10021,
    "Epargne Pierre": 10014, "Epargne Pierre Europe": 10143,
    "Epargne Pierre Sophia": 10144, "Atream Hôtels": 1032,
    "Upeka": 10147, "Accès Valeur Pierre": 1029, "Accimmo Pierre": 1050,
    "Imarea Pierre": 10111, "Opus Real": 1119, "Optimale": 10005,
    "Corum Eurion": 10000, "Corum Origin": 1000, "Corum USA": 1001,
    "Corum XL": 1097, "Darwin RE01": 10138,
    "Edmond de Rothschild Europa": 10133, "Elevation Tertiom": 10137,
    "Epsicap Explore": 10019, "Epsicap Nano": 10034,
    "Euryale Horizons Santé": 10035, "Pierval Santé": 1055,
    "Buroboutic Métropoles": 10116, "Ficommerce Proximité": 10115,
    "Logipierre 3 Résidences Services": 10118,
    "Pierre Expansion Santé": 10117, "Selectipierre 2 - Paris": 1072,
    "Cap Foncières et Territoires": 1045, "GMA Essentialis": 10008,
    "Affinités Pierre": 1031, "Attraits Pierre": 10141,
    "Elysées Grand Large": 10103, "Elysées Pierre": 1041,
    "Cristal Life": 10033, "Cristal Rente": 10033,
    "Iroko Atlas": 10006, "Iroko Zen": 10007,
    "Kyaneos Pierre": 10032, "Crédit Mutuel Pierre 1": 1065,
    "Epargne Foncière": 1001, "LF Avenir Santé": 1062,
    "LF Croissance et Territoires": 10152, "LF Europimmo": 1129,
    "LF Grand Paris Patrimoine": 1114, "LF Opportunité Immo": 1115,
    "Selectinvest 1": 1126, "Foncière Des Praticiens": 1144,
    "Osmo Energie": 1147, "Reason": 10039,
    "My Share Education": 10124, "My Share SCPI": 1145,
    "NCap Continent": 10122, "NCap Education Santé": 10123,
    "NCap Régions": 1140, "Novaxia Neo": 1099, "Novapierre 1": 1075,
    "Paref Evo": 10162, "Paref Hexa": 10164, "Paref Prima": 10163,
    "Perial Grand Paris": 1067, "Perial Hospitalité Europe": 1070,
    "Perial O2": 1068, "Perial Opportunités Europe": 1066,
    "Perial Opportunités Territoires": 1065,
    "Patrimmo Commerce": 10110, "Praemia Hotels Europe": 1131,
    "Primopierre": 1081, "Primovie": 1024,
    "Principal Inside": 1132, "Remake Live": 10017,
    "Remake UK 2025": 1127, "Efimmo 1": 1063,
    "Immorente": 1026, "Sofiboutique": 10023,
    "Sofidy Europe Invest": 10129, "Sofidynamic": 10128,
    "Sofipierre": 10020, "Coeur d'Avenir": 1102,
    "Coeur d'Europe": 10042, "Coeur de régions": 1103,
    "Coeur de ville": 10044, "Esprit Horizon": 10136,
    "ESG Pierre Capitale": 1134, "Mistral Sélection": 10130,
    "Telamon Borea": 1106, "LOG IN": 10036,
    "Urban Coeur Commerce": 1141, "Wemo One": 10131,
}

# Accent-folding table used when slugifying SCPI names for the site's URLs.
ACCENTS = {'é':'e','è':'e','ê':'e','ë':'e','â':'a','à':'a','î':'i','ï':'i','ô':'o','û':'u','ü':'u','ç':'c'}
# One translation table does all per-character work: fold accents, drop
# apostrophes/parentheses, turn spaces into hyphens.
_SLUG_TABLE = str.maketrans({**ACCENTS, "'": "", " ": "-", "(": "", ")": ""})
URL_MAP = {
    name: f"https://www.ideal-investisseur.fr/scpi-avis/{name.lower().translate(_SLUG_TABLE)}-{iid}.html"
    for name, iid in MANUAL_URLS.items()
}

sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

def fetch_and_extract(url, fetcher):
    """Fetch *url* with *fetcher* and extract SCPI data.

    Tries the page's embedded <script type="application/json"> blocks first,
    looking for the one whose top-level dict has a 'latest' key; falls back to
    plain-HTML regex extraction (extract_from_html) when no such block exists.

    Returns:
        dict | None: the parsed data, or None on fetch failure / empty page /
        no extractable data. Any unexpected error is logged and swallowed —
        this is a best-effort batch scraper.
    """
    try:
        page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        # Reject HTTP errors and near-empty bodies (soft 404s / block pages).
        if page.status != 200 or not page.body or len(page.body) < 500:
            return None
        html = page.body.decode('utf-8', errors='ignore')

        # Extract embedded JSON state blocks.
        blocks = re.findall(r'<script[^>]*type="application/json"[^>]*>(.*?)</script>', html, re.S)
        for block in blocks:
            try:
                data = json.loads(block)
            except json.JSONDecodeError:
                continue  # not every JSON script tag holds our payload
            # isinstance guard: without it, 'latest' in a decoded *string*
            # would be a substring test and could return garbage.
            if isinstance(data, dict) and 'latest' in data:
                return data

        # No JSON — try HTML extraction
        return extract_from_html(html)
    except Exception as e:
        # Best-effort: log and let the caller move on to the next SCPI.
        print(f"    Error: {e}")
        return None

def extract_from_html(html):
    """Fallback extractor: pull key SCPI figures from rendered HTML text.

    Strips markup down to plain text, then probes it with regexes for the
    figures the sheet needs. Returns a dict tagged '_source': 'html' when at
    least two figures matched, otherwise None.
    """
    # Remove scripts/styles, convert block-level tags to newlines, drop the rest.
    stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.S|re.I)
    stripped = re.sub(r'<br\s*/?>', '\n', stripped, flags=re.I)
    stripped = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', stripped, flags=re.I)
    flat = re.sub(r'<[^>]+>', ' ', stripped)

    # Decode the handful of HTML entities the site actually emits.
    entities = (('&nbsp;',' '), ('&amp;','&'), ('&euro;','€'),
                ('&rsquo;',"'"), ('&agrave;','à'), ('&eacute;','é'))
    for entity, replacement in entities:
        flat = flat.replace(entity, replacement)
    flat = re.sub(r'[ \t]+', ' ', flat)
    flat = re.sub(r'\n\s*\n+', '\n', flat)

    found = {'_source': 'html'}

    # Candidate patterns per field, tried in order; first hit wins.
    patterns = {
        'taux_endettement': [r"Taux\s+d'endettement\s*:?\s*([0-9]+[,.]?[0-9]*)\s*%", r"endettement\s*:?\s*([0-9]+[,.]?[0-9]*)"],
        'tof': [r"TOF\s*:?\s*([0-9]+[,.]?[0-9]*)\s*%", r"taux\s+d'occupation[^:]*:\s*([0-9]+[,.]?[0-9]*)"],
        'prix_souscription': [r"Prix\s+de\s+souscription\s*:?\s*([0-9\s,.]+)\s*€", r"souscription\s*:?\s*([0-9\s,.]+)\s*€"],
        'taux_distribution': [r"Taux\s+de\s+distribution[^:]*:\s*([0-9]+[,.]?[0-9]*)\s*%"],
        'tri_10_ans': [r"TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*([0-9]+[,.]?[0-9]*)\s*%"],
        'comm_souscription': [r"(?:Commission|Frais)\s+de\s+souscription\s*:?\s*([0-9]+[,.]?[0-9]*)\s*%"],
        'comm_gestion': [r"(?:Commission|Frais)\s+de\s+gestion\s*:?\s*([0-9]+[,.]?[0-9]*)\s*%"],
        'capitalisation': [r"Capitalisation\s*:?\s*([0-9]+[,.]?[0-9]*\s*(?:M€|Mds?€|Md€))"],
    }

    for field, candidates in patterns.items():
        for candidate in candidates:
            hit = re.search(candidate, flat, re.I)
            if hit:
                found[field] = hit.group(1).strip()
                break

    # Require at least two real figures beside the '_source' marker.
    return found if len(found) > 2 else None

def map_data_to_cols(data, empty_cols):
    """Map extracted SCPI data onto the sheet columns that still need values.

    Args:
        data: dict from fetch_and_extract — either the site's embedded JSON
            (carrying a 'latest' sub-dict) or the HTML-regex fallback,
            distinguished by the presence of the '_source' marker key.
        empty_cols: container of column letters still empty in the row.

    Returns:
        dict: column letter -> formatted cell value (French conventions:
        decimal comma, '%'/'€' suffixes). Zero / unparseable values are
        omitted so they never overwrite a cell with a meaningless "0".
    """
    result = {}
    latest = data.get('latest', {})
    is_json = '_source' not in data

    def val(key):
        # Normalize a JSON field to a stripped string; None for empty markers.
        v = latest.get(key)
        if v is None or str(v).strip() in ('', 'null', 'None'):
            return None
        return str(v).strip()

    def pct(v):
        # Format a positive numeric string as a French percentage ("6,2%").
        # Returns None for zero / unparseable values (they are not written).
        if not v or v in ('0', '0.00', '0,00'):
            return None
        try:
            if float(v.replace(',', '.')) > 0:
                return f"{v.replace('.', ',')}%"
        except ValueError:
            pass
        return None

    if is_json:
        # Percentage columns share one formatting rule:
        # K endettement, S TOF, T distribution, U TRI 10 ans.
        for key, col in (('taux_endettement', 'K'), ('tof', 'S'),
                         ('taux_distribution', 'T'), ('tri_10_ans', 'U')):
            if col in empty_cols:
                formatted = pct(val(key))
                if formatted:
                    result[col] = formatted

        # M: pending withdrawal shares — raw count, no suffix.
        # ('0,00' now skipped too, consistent with the other zero checks.)
        if 'M' in empty_cols:
            v = val('parts_en_attente_retrait')
            if v and v not in ('0', '0.00', '0,00'):
                result['M'] = v

        # V: subscription price in euros.
        if 'V' in empty_cols:
            v = val('prix_souscription')
            if v and v != '0':
                result['V'] = f"{v.replace('.', ',')}€"

        # W: reconstitution value in euros.
        if 'W' in empty_cols:
            v = val('valeur_reconstitution')
            if v and v not in ('0', '0.00', '0,00'):
                result['W'] = f"{v.replace('.', ',')}€"

        # G: ISR label rendered as Oui/Non; unknown values are left out.
        if 'G' in empty_cols:
            v = val('label_isr')
            if v is not None:
                if v.lower() in ('true', '1', 'oui', 'yes'):
                    result['G'] = 'Oui'
                elif v.lower() in ('false', '0', 'non', 'no'):
                    result['G'] = 'Non'

        # J: capitalisation, kept verbatim (already human-formatted).
        if 'J' in empty_cols:
            v = val('capitalisation')
            if v:
                result['J'] = v

        # D/H/P/Q/X/Y are not present in the embedded JSON payload.
    else:
        # HTML fallback: regex-captured strings, written as-is plus a suffix.
        for key, col, suffix in [
            ('taux_endettement', 'K', '%'),
            ('tof', 'S', '%'),
            ('taux_distribution', 'T', '%'),
            ('tri_10_ans', 'U', '%'),
            ('comm_souscription', 'X', '% TTC'),
            ('comm_gestion', 'Y', '% TTC'),
        ]:
            if col in empty_cols:
                v = data.get(key)
                if v:
                    try:
                        # Thousands spaces and decimal commas -> parseable float.
                        if float(v.replace(',', '.').replace(' ', '')) > 0:
                            result[col] = f"{v}{suffix}"
                    except ValueError:
                        pass

        v = data.get('capitalisation')
        if v and 'J' in empty_cols:
            result['J'] = v

        v = data.get('prix_souscription')
        if v and 'V' in empty_cols:
            result['V'] = f"{v}€"

    return result

# === MAIN ===
# Pass 1: read the whole tab and record, per data row, which of columns D..Y
# are empty or hold a placeholder and therefore still need filling.
print("Reading sheet...")
rows = read_sheet()

# Find rows with gaps
needs = {}  # sheet row number -> (scpi name, {column letter: 0-based index})
for i in range(1, len(rows)):  # skip the header row
    row = rows[i]
    r = i + 1  # 1-based sheet row number for A1-style ranges
    scpi = row[1].strip() if len(row) > 1 else ''  # column B holds the SCPI name
    if not scpi:
        continue
    empty = {}
    for j in range(3, 25):  # columns D (index 3) through Y (index 24)
        # The API omits trailing empty cells, so guard against short rows.
        val = row[j].strip() if j < len(row) else ''
        if val in ('', 'N/A', 'Non trouvé', '-'):
            col = chr(65+j)  # 0-based index -> column letter ('A' is 65)
            empty[col] = j
    if empty:
        needs[r] = (scpi, empty)

print(f"Rows needing data: {len(needs)}")

# Columns we can fill from ideal-investisseur JSON
FILLABLE = {'D', 'G', 'H', 'J', 'K', 'L', 'M', 'P', 'Q', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'}
# Columns that need other sources (HTML scraping or PDFs)
NEEDS_PDF = {'N', 'O'}  # RAN, PGE

# Pass 2: fetch each SCPI's page, map its data to the row's gaps, and write
# cell by cell, tracking why any row produced nothing.
fetcher = Fetcher(auto_match=False)
total_cells = 0   # cells successfully written
total_rows = 0    # rows with at least one attempted write
no_url = []       # SCPI names missing from MANUAL_URLS
no_data = []      # fetch failed or page had no extractable data
no_match = []     # data fetched but none of it fits the row's empty columns
skipped = []      # rows whose only gaps are the PDF-only columns N/O

for r, (scpi, empty) in sorted(needs.items()):
    # Skip columns N, O (RAN, PGE) — need PDFs
    fillable_empty = {col: idx for col, idx in empty.items() if col in FILLABLE}
    if not fillable_empty:
        skipped.append(scpi)
        continue
    
    if scpi not in URL_MAP:
        no_url.append(scpi)
        continue
    
    url = URL_MAP[scpi]
    data = fetch_and_extract(url, fetcher)
    if not data:
        no_data.append(scpi)
        continue
    
    mapped = map_data_to_cols(data, fillable_empty)
    updates = [(col, mapped[col]) for col in fillable_empty if col in mapped]
    if not updates:
        no_match.append(scpi)
        continue
    
    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.12)  # throttle per-cell writes to spare the API quota
        else:
            print(f"{col}❌({value[:20]}) ", end="", flush=True)
    print(f"({row_cells}/{len(fillable_empty)} fillable)")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.25)  # extra pause between rows

# Summary: totals plus the per-category failure lists for follow-up.
print(f"\n{'='*60}")
print(f"🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")
print(f"\nNo URL ({len(no_url)}): {', '.join(no_url)}")
print(f"No data ({len(no_data)}): {', '.join(no_data)}")
print(f"No match ({len(no_match)}): {', '.join(no_match)}")
print(f"Skipped (only N/O gaps) ({len(skipped)}): {', '.join(skipped[:5])}...")

# Persist machine-readable results for a follow-up run or audit.
with open('/tmp/scpi_batch_v2_results.json', 'w') as f:
    json.dump({'total_cells': total_cells, 'total_rows': total_rows, 'no_url': no_url, 'no_data': no_data, 'no_match': no_match}, f, ensure_ascii=False, indent=2)
