#!/usr/bin/env python3
"""
Pass 6: Find CBI + II URLs via SearXNG for remaining SCPI without data.
Then scrape advenis-reim.com and other gestionnaire sites for the rest.
"""
import json, re, subprocess, time, sys
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher
from urllib.request import urlopen, Request
import urllib.parse

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"
SEARXNG = "http://192.168.1.127:8081/search"

def read_sheet():
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return json.loads(r.stdout).get('values', [])
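# read_sheet() returns the raw values grid (lists of cell strings); row 0 is the
# header row, which the fix-up loop below skips.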

def gws_update(range_str, value):
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": f"Listing sites!{range_str}", "valueInputOption": "USER_ENTERED"}),
           '--json', json.dumps({"range": f"Listing sites!{range_str}", "values": [[value]]})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in r.stdout
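# Illustrative example (not executed): gws_update("D12", "Variable") writes
# "Variable" into cell D12 of the "Listing sites" tab, reporting success by
# checking the CLI's response for "updatedCells".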

def html_to_text(body):
    html = body.decode('utf-8', errors='ignore') if isinstance(body, bytes) else body
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    for old, new in [('&nbsp;',' '),('&amp;','&'),('&euro;','€'),('&rsquo;',"'"),('&#8217;',"'")]:
        text = text.replace(old, new)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text
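# Illustrative example (not executed):
#   html_to_text(b"<p>Prix&nbsp;: 200&euro;</p>")  ->  "\nPrix : 200€\n"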

def searxng_find(scpi, site):
    """Return the first non-PDF result URL on `site` for this SCPI, or None."""
    try:
        q = urllib.parse.quote(f"site:{site} scpi {scpi}")
        url = f"{SEARXNG}?q={q}&format=json"
        with urlopen(Request(url), timeout=8) as resp:
            data = json.loads(resp.read().decode('utf-8'))
        for r in data.get('results', []):
            u = r['url']
            if site in u and '.pdf' not in u:
                return u
    except Exception:
        pass  # network/JSON errors: treat as "no URL found"
    return None
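# The SearXNG JSON API returns (roughly):
#   {"results": [{"url": "...", "title": "...", "content": "...", ...}, ...], ...}
# searxng_find() only relies on the "url" field of each result.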

def extract_generic(text):
    """Generic extraction from any SCPI page.

    Returns a dict mapping sheet column letters (D..Y) to extracted values;
    a key is only present when the corresponding pattern matched."""
    d = {}
    
    # D: Capital
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()
    
    # G: ISR
    if re.search(r'Label\s+ISR', text, re.I): d['G'] = 'Oui'
    
    # K: Endettement
    m = re.search(r"[Tt]aux\s+d'endettement\s*(?:\(.*?\))?\s*:?\s*(\d+[,.]?\d*)\s*%", text)
    if m: d['K'] = m.group(1) + '%'
    
    # H: Souscription min
    m = re.search(r"(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)", text, re.I)
    if m: d['H'] = m.group(1).strip()
    
    # Q: Jouissance
    m = re.search(r"[Dd][ée]lai\s+de\s+jouissance\s*:?\s*(\d+\s*mois)", text, re.I)
    if m: d['Q'] = m.group(1).strip()
    
    # L: Parts
    m = re.search(r"[Nn]ombre\s*de\s*parts?\s*:?\s*(\d[\d\s]*)", text)
    if m: d['L'] = m.group(1).strip().replace(' ', '')
    
    # M: Parts retrait
    m = re.search(r"(\d[\d\s]*)\s*parts?\s*en\s*attente", text)
    if m: d['M'] = m.group(1).strip().replace(' ', '')
    
    # N: RAN (report à nouveau)
    m = re.search(r"[Rr]eport\s*[àa]\s*nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)", text)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r"[Rr]eport\s*[àa]\s*nouveau\s*:?\s*([\d\s,.]+\s*€)", text)
        if m: d['N'] = m.group(1).strip()
    
    # O: PGE
    m = re.search(r"(?:Provision|Réserve)\s+pour\s+gros\s+entretien\s*:?\s*([\d\s,.]+\s*€)", text, re.I)
    if m: d['O'] = m.group(1).strip()
    
    # P: Fiscalite
    if re.search(r"impôt.*?sociétés.*?allemand|IS\s*15,825", text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r"revenus?\s+fonciers|PFU|prélèvement\s+forfaitaire", text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'
    
    # R: Fréquence de distribution
    if re.search(r'trimestriel', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'semestriel', text, re.I): d['R'] = 'Semestrielle'
    
    # S: TOF
    m = re.search(r"(?:TOF|Taux\s+d'occupation\s+financière)\s*:?\s*(\d+[,.]?\d*)\s*%", text, re.I)
    if m: d['S'] = m.group(1) + '%'
    
    # T: PGA / taux de distribution (TD)
    m = re.search(r"(?:Performance\s+globale\s+annuelle|PGA|Taux\s+de\s+distribution)\s*(?:2025|\d{4})?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%", text, re.I)
    if m: d['T'] = m.group(1) + '%'
    
    # U: TRI
    m = re.search(r"TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%", text, re.I)
    if m: d['U'] = m.group(1) + '%'
    
    # V: Prix
    m = re.search(r"(?:Valeur|Prix)\s+de\s+souscription\s*:?\s*(\d[\d\s,.]*\s*€)", text, re.I)
    if m: d['V'] = m.group(1).strip()
    
    # W: Reconstitution
    m = re.search(r"Valeur\s+de\s+reconstitution\s*:?\s*(\d[\d\s,.]*\s*€)", text, re.I)
    if m: d['W'] = m.group(1).strip()
    
    # X: Commission de souscription
    m = re.search(r"(?:Commission|Frais)\s+de\s+souscription\s*:?\s*(\d+[,.]?\d*)\s*%\s*(?:\((HT|TTC)\))?", text, re.I)
    if m: d['X'] = m.group(1) + (' ' + m.group(2) if m.group(2) else ' TTC')
    
    # Y: Commission gestion
    m = re.search(r"(?:Commission|Frais)\s+de\s+gestion\s*:?\s*(\d+[,.]?\d*)\s*%\s*(?:\((HT|TTC)\))?", text, re.I)
    if m: d['Y'] = m.group(1) + (' ' + m.group(2) if m.group(2) else ' TTC')
    
    # E: Geo
    m = re.search(r"[Rr]épartition\s*géographique[^:]*:?\s*\n((?:[^\n]+\n){2,12})", text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])
    
    # F: Secteur
    m = re.search(r"[Rr]épartition\s*(?:par\s+)?secteur[^:]*:?\s*\n((?:[^\n]+\n){2,12})", text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])
    
    # I: Date
    m = re.search(r"(?:Date|Année)\s+de\s+création\s*:?\s*(\w+\s+\d{4}|\d{4})", text, re.I)
    if m: d['I'] = m.group(1).strip()
    
    # J: Capitalisation
    m = re.search(r"Capitalisation\s*:?\s*(\d+[,.]?\d*\s*(?:milliards?|Mds?|M)\s*€?)", text, re.I)
    if m: d['J'] = m.group(1).strip()
    
    return d
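# Illustrative example (not executed) of what extract_generic() yields:
#   text = "Capital variable\nTaux d'occupation financière : 92,5 %\nValeur de souscription : 200 €"
#   extract_generic(text) -> {'D': 'Variable', 'S': '92,5%', 'V': '200 €'}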

# Read sheet
rows = read_sheet()

# Get SCPI names that still need fixes
needs = {}
for i in range(1, len(rows)):
    row = rows[i]
    r = i + 1
    scpi = row[1].strip() if len(row) > 1 else ''
    if not scpi: continue
    empty = {}
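    # Indices 3..24 are sheet columns D..Y; chr(65 + j) maps index -> letter.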
    for j in range(3, 25):
        val = row[j].strip() if j < len(row) else ''
        if val in ('', 'N/A', 'Non trouvé', '-'):
            col = chr(65+j)
            empty[col] = j
    if empty:
        needs[r] = (scpi, empty)
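
# needs: sheet row number -> (SCPI name, {column letter: column index}) for each
# row that still has empty/placeholder cells.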

print(f"Total rows needing fixes: {len(needs)}")

# Candidate source sites, queried through SearXNG in priority order
SITES = [
    "ideal-investisseur.fr/scpi-avis",
    "commentbieninvestir.fr",
    "advenis-reim.com",
    "prd-information.com",
    "lalouviere.com",
]
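# For each row, the sites are tried in order; the first page that fetches with
# status 200 and a non-trivial body (> 1000 bytes) is used.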

fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0
failed = []

for r, (scpi, empty) in sorted(needs.items()):
    text = None
    found_url = None
    
    # Try each source
    for site in SITES:
        url = searxng_find(scpi, site)
        if url:
            try:
                page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
                if page.status == 200 and page.body and len(page.body) > 1000:
                    text = html_to_text(page.body)
                    found_url = url
                    break
            except Exception:
                pass  # fetch failed; fall through to the next source site
    
    if not text:
        failed.append(scpi)
        continue
    
    data = extract_generic(text)
    updates = [(col, data[col]) for col in empty if col in data]
    if not updates:
        failed.append(f"{scpi} (no matching data)")
        continue
    
    print(f"Row {r:3d} ({scpi:35s}) [{found_url[:40]}]: ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)
        else:
            print(f"{col}❌ ", end="", flush=True)
    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")
print(f"\nFailed/Skipped ({len(failed)}): {', '.join(failed[:20])}")
