#!/usr/bin/env python3
"""
Pass 3: Use SearXNG to find ideal-investisseur URLs, then scrape and fill gaps.
Focus columns: K (endettement), L (parts), M (retrait), N (RAN), O (PGE), 
Q (jouissance), S (TOF), T (PGA), U (TRI), X/Y (commissions), G (ISR), D (capital)
"""
import json, re, subprocess, time, sys, os
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher
from urllib.request import urlopen, Request
import urllib.parse

# Google Sheets spreadsheet ID of the SCPI listing workbook (tab "Listing sites").
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

def read_sheet():
    """Fetch rows A1:Y111 of the "Listing sites" tab via the gws CLI.

    Returns the sheet as a list of row lists (possibly ragged), or []
    when the API response carries no 'values' key.
    """
    params = {"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps(params)]
    proc = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return json.loads(proc.stdout).get('values', [])

def gws_update(range_str, value):
    """Write *value* into the single cell *range_str* (e.g. "K12") on the
    "Listing sites" tab via the gws CLI.

    Returns True when the CLI output reports updated cells, False otherwise.
    """
    full_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": full_range, "valueInputOption": "USER_ENTERED"}
    body = {"range": full_range, "values": [[value]]}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps(params),
           '--json', json.dumps(body)]
    proc = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in proc.stdout

def html_to_text(html):
    """Strip markup from *html* and return roughly line-structured plain text.

    Structural tags become newlines so downstream regexes can anchor on
    line boundaries; every other tag becomes a single space.
    """
    # Drop script/style bodies entirely, then map structural tags to newlines.
    stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.S | re.I)
    stripped = re.sub(r'<br\s*/?>', '\n', stripped, flags=re.I)
    stripped = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th|dd|dt|table|thead|tbody|span|strong|em|b|i)[^>]*>', '\n', stripped, flags=re.I)
    # Any remaining tag becomes a space so words don't fuse together.
    plain = re.sub(r'<[^>]+>', ' ', stripped)
    # Decode the handful of HTML entities these sites actually use
    # (insertion order matters: '&amp;' must come after '&nbsp;').
    entities = {'&nbsp;': ' ', '&amp;': '&', '&euro;': '€', '&lt;': '<',
                '&gt;': '>', '&rsquo;': "'", '&laquo;': '"', '&raquo;': '"'}
    for entity, char in entities.items():
        plain = plain.replace(entity, char)
    # Collapse runs of spaces/tabs, then runs of blank lines.
    plain = re.sub(r'[ \t]+', ' ', plain)
    plain = re.sub(r'\n\s*\n+', '\n', plain)
    return plain

def searxng_find_ii_url(scpi_name):
    """Return the best ideal-investisseur.fr URL for *scpi_name*, or None.

    Queries the local SearXNG instance with a site-restricted search.
    A '/scpi-avis/...html' review page anywhere in the result list is
    preferred; otherwise the first plain '/scpi/' page is used. Any
    network/JSON failure yields None (best-effort: caller skips the source).
    """
    try:
        q = urllib.parse.quote(f"site:ideal-investisseur.fr/scpi-avis {scpi_name}")
        url = f"http://192.168.1.127:8081/search?q={q}&format=json"
        req = Request(url)
        resp = urlopen(req, timeout=8)
        data = json.loads(resp.read().decode('utf-8'))
        fallback = None
        for r in data.get('results', []):
            u = r.get('url', '')  # tolerate results missing a 'url' key
            if 'ideal-investisseur.fr/scpi-avis/' in u and u.endswith('.html'):
                return u  # review pages carry the richest data: take the first
            if fallback is None and 'ideal-investisseur.fr/scpi/' in u:
                fallback = u
        return fallback
    except Exception:  # narrowed from bare except: don't trap KeyboardInterrupt/SystemExit
        pass
    return None

def searxng_find_cbi_url(scpi_name):
    """Return a commentbieninvestir.fr URL for *scpi_name*, or None.

    Queries the local SearXNG instance; the first result on the target
    domain wins. Any network/JSON failure yields None (best-effort:
    caller treats None as "no source found").
    """
    try:
        q = urllib.parse.quote(f"site:commentbieninvestir.fr scpi {scpi_name}")
        url = f"http://192.168.1.127:8081/search?q={q}&format=json"
        req = Request(url)
        resp = urlopen(req, timeout=8)
        data = json.loads(resp.read().decode('utf-8'))
        for r in data.get('results', []):
            u = r.get('url', '')  # tolerate results missing a 'url' key
            if 'commentbieninvestir.fr' in u:
                return u
    except Exception:  # narrowed from bare except: don't trap KeyboardInterrupt/SystemExit
        pass
    return None

def extract_ii(text):
    """Extract SCPI data points from an ideal-investisseur.fr page.

    *text* is the plain-text rendering of the page (see html_to_text).
    Returns a dict keyed by spreadsheet column letter (D..Y); only keys
    whose pattern matched are present, so the caller can fill gaps safely.
    """
    d = {}
    
    # K: Endettement (debt ratio, %)
    m = re.search(r'(?:Taux\s+d\'endettement|Endettement)\s*(?:\(.*?\))?\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['K'] = m.group(1).replace(',', '.') + '%'
    
    # L: Nombre de parts (share count, thousands separators removed)
    m = re.search(r'(?:Nombre\s+de\s+parts|Parts?\s+émises?|Nb\s+parts)\s*:?\s*(\d[\d\s]*)', text, re.I)
    if m: d['L'] = m.group(1).strip().replace(' ', '')
    
    # M: Parts en attente de retrait
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente', text, re.I)
    if m: d['M'] = m.group(1).strip().replace(' ', '')
    
    # N: Report à nouveau — expressed as days of distribution, as an explicit
    # € amount, or as a bare number (assumed euros only when large).
    m = re.search(r'Report\s+à\s+nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)', text, re.I)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'Report\s+à\s+nouveau\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
        if m: d['N'] = m.group(1).strip()
        else:
            m = re.search(r'Report\s+à\s+nouveau\s*:?\s*(\d+[\d\s]*)', text, re.I)
            if m:
                val = m.group(1).strip()
                if len(val) > 4:  # Large number = euros; short numbers are ambiguous, skip
                    d['N'] = val + ' €'
    
    # O: Provision pour gros entretien (PGE, €)
    m = re.search(r'(?:Provision|Réserve)\s+pour\s+gros\s+entretien\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
    if m: d['O'] = m.group(1).strip()
    
    # Q: Délai de jouissance (months)
    m = re.search(r'Délai\s+de\s+jouissance\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()
    
    # S: TOF (taux d'occupation financière, %)
    m = re.search(r'(?:TOF|Taux\s+d\'occupation\s+financière)\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['S'] = m.group(1).replace(',', '.') + '%'
    
    # T: Performance globale annuelle. Capture the optional leading minus
    # (U+2212 minus, en-dash, or ASCII '-') so a negative performance is no
    # longer silently recorded as positive.
    m = re.search(r'(?:Performance\s+globale\s+annuelle|PGA)\s*(?:2025|2024)?\s*:?\s*([−–-]?)\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['T'] = ('-' if m.group(1) else '') + m.group(2).replace(',', '.') + '%'
    
    # U: TRI 10 ans — same sign handling as T.
    m = re.search(r'TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*([−–-]?)\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['U'] = ('-' if m.group(1) else '') + m.group(2).replace(',', '.') + '%'
    
    # V: Prix/valeur de souscription (€)
    m = re.search(r'(?:Valeur|Prix)\s+de\s+souscription\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['V'] = m.group(1).strip()
    
    # W: Valeur de reconstitution (€)
    m = re.search(r'Valeur\s+de\s+reconstitution\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['W'] = m.group(1).strip()
    
    # X: Commission de souscription — HT/TTC tag read from the 30 chars
    # following the match
    m = re.search(r'(?:Commission|Frais)\s+de\s+souscription\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m:
        rest = text[m.end():m.end()+30]
        d['X'] = m.group(1).replace(',', '.') + ('% TTC' if 'TTC' in rest else '% HT')
    
    # Y: Commission de gestion — same HT/TTC detection as X
    m = re.search(r'(?:Commission|Frais)\s+de\s+gestion\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m:
        rest = text[m.end():m.end()+30]
        d['Y'] = m.group(1).replace(',', '.') + ('% TTC' if 'TTC' in rest else '% HT')
    
    # G: Label ISR — presence anywhere on the page counts as "Oui"
    if re.search(r'Label\s+ISR', text, re.I): d['G'] = 'Oui'
    
    # D: Capital fixe/variable
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()
    
    # H: Souscription minimum (€)
    m = re.search(r'(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['H'] = m.group(1).strip()
    
    # P: Fiscalité — checked from most to least specific regime
    if re.search(r'impôt\s+sur\s+les\s+sociétés\s+allemand|IS\s*15,825', text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r'source étrangère|convention[^.]*fiscale', text, re.I):
        d['P'] = 'Revenus source étrangère (conventions fiscales)'
    elif re.search(r'revenus?\s+fonciers|PFU|prélèvement\s+forfaitaire', text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'
    
    # E: Répartition géographique — up to 6 "Place xx%" pairs from the
    # lines following the heading
    m = re.search(r'[Rr]épartition\s*géographique[^:]*:?\s*\n((?:[^\n]+\n){2,12})', text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])
    
    # F: Répartition sectorielle — same approach as E
    m = re.search(r'[Rr]épartition\s*(?:par\s+)?secteur[^:]*:?\s*\n((?:[^\n]+\n){2,12})', text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])
    
    # I: Date/année de création
    m = re.search(r'(?:Date\s+de\s+création|Année\s+de\s+création|Créée?\s+en)\s*:?\s*(\w+\s+\d{4}|\d{4})', text, re.I)
    if m: d['I'] = m.group(1).strip()
    
    # J: Capitalisation (with unit: milliards / Mds / M)
    m = re.search(r'Capitalisation\s*:?\s*(\d+[,.]?\d*\s*(?:milliards?|Mds?|M)\s*€?)', text, re.I)
    if m: d['J'] = m.group(1).strip()
    
    # R: Fréquence de distribution
    if re.search(r'trimestriel', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'semestriel', text, re.I): d['R'] = 'Semestrielle'
    
    return d

def extract_cbi(text):
    """Extract SCPI data points from a commentbieninvestir.fr page.

    Similar to extract_ii but tuned to this site's layout, where a label and
    its value usually land on consecutive lines after html_to_text — hence
    the literal newline in most patterns. Returns a dict keyed by spreadsheet
    column letter; only keys whose pattern matched are present.
    """
    d = {}
    
    # D: Capital fixe/variable
    m = re.search(r'[Cc]apital\s*(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()
    
    # G: Label ISR — presence anywhere on the page counts as "Oui"
    if re.search(r'[Ll]abel\s+ISR', text): d['G'] = 'Oui'
    
    # K: Endettement (debt ratio, %) — value expected on the next line
    m = re.search(r'[Tt]aux\s+d\'endettement\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['K'] = m.group(1) + '%'
    
    # X: Commission de souscription, with its HT/TTC suffix
    m = re.search(r'[Ff]rais\s*de\s*souscription\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m: d['X'] = m.group(1) + ' ' + m.group(2)
    
    # Y: Commission de gestion, with its HT/TTC suffix
    m = re.search(r'[Ff]rais\s*de\s*gestion\s*\n\s*(\d+[,.]?\d*%)\s*(HT|TTC)', text)
    if m: d['Y'] = m.group(1) + ' ' + m.group(2)
    
    # Q: Délai de jouissance (months)
    m = re.search(r'[Dd][ée]lai\s*de\s*jouissance\s*\n?\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()
    
    # L: Nombre de parts (thousands separators removed)
    m = re.search(r'[Nn]ombre\s*de\s*parts?\s*\n?\s*:?\s*(\d[\d\s]*)', text)
    if m: d['L'] = m.group(1).strip().replace(' ', '')
    
    # M: Parts en attente de retrait
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente', text)
    if m: d['M'] = m.group(1).strip().replace(' ', '')
    
    # N: Report à nouveau — days of distribution first, € amount as fallback
    m = re.search(r'[Rr]eport\s*[àa]\s*nouveau\s*\n\s*(\d+\s*jours?\s*de\s*distribution)', text)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'[Rr]eport\s*[àa]\s*nouveau\s*\n?\s*:?\s*([\d\s,.]+€)', text)
        if m: d['N'] = m.group(1).strip()
    
    # O: Provision pour gros entretien (bare number, no € captured here)
    m = re.search(r'[Pp]rovision.*?[Gg]ros.*?[Ee]ntretien\s*\n?\s*:?\s*([\d\s,.]+)', text)
    if m: d['O'] = m.group(1).strip()
    
    # W: Valeur de reconstitution (€)
    m = re.search(r'[Vv]aleur\s*de\s*reconstitution\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['W'] = m.group(1).strip()
    
    # V: Prix de la part / de souscription (€)
    m = re.search(r'[Pp]rix\s*de\s*(?:la\s*part|souscription)\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['V'] = m.group(1).strip()
    
    # T: Performance globale annuelle (%)
    m = re.search(r'[Pp]erformance\s*globale\s*annuelle\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['T'] = m.group(1) + '%'
    
    # U: TRI 10 ans (%)
    m = re.search(r'TRI\s*10\s*ans\s*\n\s*(\d+[,.]\d+)\s*%', text)
    if m: d['U'] = m.group(1) + '%'
    
    # H: Investissement minimum (€)
    m = re.search(r'[Ii]nvestissement\s*minimum\s*\n\s*(\d[\d\s,.]*€)', text)
    if m: d['H'] = m.group(1).strip()
    
    # P: Fiscalité — German corporate-tax regime first, French regime fallback
    if re.search(r'impôt.*?sociétés.*?allemand|IS\s*15,825', text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r'revenus?\s+fonciers|PFU', text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'
    
    # E/F from CBI
    # E: Répartition géographique — up to 6 "Place xx%" pairs after the heading
    m = re.search(r'[Rr][ée]partition\s*[Gg][ée]ographique[^"]*?\n((?:[^\n]+\n){2,12})', text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])
    
    # F: Répartition sectorielle — same approach as E
    m = re.search(r'[Rr][ée]partition\s*[Ss]ectorielle[^"]*?\n((?:[^\n]+\n){2,12})', text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])
    
    # R: Fréquence de distribution
    if re.search(r'trimestrielle', text, re.I): d['R'] = 'Trimestrielle'
    elif re.search(r'semestrielle', text, re.I): d['R'] = 'Semestrielle'
    
    return d

# Pull the current sheet contents.
rows = read_sheet()
headers = rows[0]

# Map sheet row number -> (SCPI name, {column letter: column index}) for every
# row that still has blank/placeholder cells in columns D..Y.
needs = {}
for sheet_row, row in enumerate(rows[1:], start=2):
    name = row[1].strip() if len(row) > 1 else ''
    if not name:
        continue
    missing = {}
    for col_idx in range(3, 25):  # columns D (3) through Y (24)
        cell = row[col_idx].strip() if col_idx < len(row) else ''
        if cell in ('', 'N/A', 'Non trouvé', '-'):
            missing[chr(65 + col_idx)] = col_idx
    if missing:
        needs[sheet_row] = (name, missing)

print(f"Rows needing fixes: {len(needs)}")

fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0
cache = {}

# Main fill loop: for each incomplete row, locate a source page
# (ideal-investisseur first, commentbieninvestir as fallback), scrape it,
# and write each missing cell back to the sheet individually.
for r, (scpi, empty) in sorted(needs.items()):
    text = None
    source = None
    
    # Try ideal-investisseur first via SearXNG
    ii_url = searxng_find_ii_url(scpi)
    if ii_url and ii_url not in cache.get('ii_fail', set()):
        try:
            page = fetcher.get(ii_url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
            if page.status == 200 and len(page.text) > 2000:
                text = html_to_text(page.text)
                source = 'ii'
        except Exception:
            # Fetch errors are non-fatal: fall through to the fallback source.
            # (Narrowed from bare except so Ctrl-C still interrupts the run.)
            pass
        if not text:
            # Remember dead URLs so repeated rows don't re-fetch them.
            cache.setdefault('ii_fail', set()).add(ii_url)
    
    # Try commentbieninvestir as fallback
    if not text:
        cbi_url = searxng_find_cbi_url(scpi)
        if cbi_url and cbi_url not in cache.get('cbi_fail', set()):
            try:
                page = fetcher.get(cbi_url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
                if page.status == 200 and len(page.text) > 2000:
                    text = html_to_text(page.text)
                    source = 'cbi'
            except Exception:
                # Same best-effort policy as above; the row is skipped below.
                pass
            if not text:
                cache.setdefault('cbi_fail', set()).add(cbi_url)
    
    if not text:
        continue
    
    data = extract_ii(text) if source == 'ii' else extract_cbi(text)
    if not data:
        continue
    
    # Only write cells that were empty AND were actually extracted.
    updates = [(col, data[col]) for col in empty if col in data]
    if not updates:
        continue
    
    print(f"Row {r:3d} ({scpi:35s}) [{source}]: ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # pace writes to stay under the Sheets API rate limit
        else:
            print(f"{col}❌ ", end="", flush=True)
    
    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")
