#!/usr/bin/env python3
"""SCPI Batch Filler v5 — scpi-lab.com extraction for remaining gaps.
Uses web_fetch to get scpi-lab pages (no JS rendering needed)."""
import json
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

def gws_update(range_str, value):
    """Write a single cell value into the 'Listing sites' tab via the gws CLI.

    Returns True when the CLI output mentions 'updatedCells' (write accepted).
    """
    sheet_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": sheet_range, "valueInputOption": "USER_ENTERED"}
    body = {"range": sheet_range, "values": [[value]]}
    argv = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
            '--params', json.dumps(params),
            '--json', json.dumps(body)]
    proc = subprocess.run(argv, capture_output=True, text=True,
                          cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in proc.stdout

def read_sheet():
    """Fetch rows A1:Y111 of the 'Listing sites' tab via the gws CLI.

    Returns the API's 'values' matrix (list of row lists), or [] when the
    tab has no values. Raises RuntimeError with the CLI's stderr when the
    command fails — previously an empty/garbage stdout surfaced as an
    opaque json.JSONDecodeError.
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True,
                       cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0 or not r.stdout.strip():
        raise RuntimeError(
            f"gws sheets get failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def web_fetch_text(url, max_chars=10000):
    """Fetch *url* and return up to *max_chars* of readable text, or None.

    Best-effort: any failure (DNS, timeout, HTTP error, bad encoding)
    returns None so callers can simply skip the row. The old bare
    ``except:`` also swallowed KeyboardInterrupt/SystemExit — narrowed
    to Exception. The response is now closed via ``with``.
    """
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
            'Accept': 'text/html'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            html = resp.read().decode('utf-8', errors='ignore')
        return _html_to_text(html)[:max_chars]
    except Exception:
        return None


def _html_to_text(html):
    """Strip tags and common entities from HTML, keeping rough line structure."""
    # Drop script/style bodies entirely, then turn block-level tags into newlines.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S | re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    # Decode the handful of entities these pages emit. '&amp;' goes LAST:
    # decoding it first turned '&amp;lt;' into '<' (double-unescape bug).
    for old, new in [('&nbsp;', ' '), ('&euro;', '€'), ('&rsquo;', "'"),
                     ('&lt;', '<'), ('&gt;', '>'), ('&#39;', "'"),
                     ('&quot;', '"'), ('&amp;', '&')]:
        text = text.replace(old, new)
    # Collapse runs of spaces/tabs and blank lines.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text

# scpi-lab.com URL patterns - need to search for each SCPI
# Use the search on scpi-lab
def find_scpi_lab_url(scpi_name):
    """Search a local SearxNG instance for this SCPI's page on scpi-lab.com.

    Returns the first of the top-3 result URLs that is a main SCPI page
    (under scpi-lab.com/scpi/, not an /information/ subpage), or None when
    nothing matches or the search service is unreachable.
    """
    try:
        query = f"site:scpi-lab.com {scpi_name} SCPI"
        # FIX: quote() is documented in urllib.parse; the original called
        # urllib.request.quote, which only exists as an undocumented
        # re-export — and the bare `except` would have silently hidden an
        # AttributeError if that re-export ever disappeared.
        url = f"http://192.168.1.127:8081/search?q={urllib.parse.quote(query)}&format=json"
        req = urllib.request.Request(url)
        resp = urllib.request.urlopen(req, timeout=10)
        data = json.loads(resp.read().decode())
        for r in data.get('results', [])[:3]:
            u = r.get('url', '')
            if 'scpi-lab.com/scpi/' in u and '/information/' not in u:
                return u
    except Exception:
        # Best-effort: an unreachable search box just means "no URL found".
        pass
    return None

# Pull the whole sheet once.
rows = read_sheet()

# Map: sheet row number -> (SCPI name, {column letter: 0-based index}) for
# every data row that still has fillable blanks in columns D..Y
# (N and O are deliberately excluded).
needs = {}
for row_num, row in enumerate(rows[1:], start=2):
    name = row[1].strip() if len(row) > 1 else ''
    if not name:
        continue
    blanks = {}
    for j in range(3, 25):
        cell = row[j].strip() if j < len(row) else ''
        if cell not in ('', 'N/A', 'Non trouvé', '-'):
            continue
        letter = chr(ord('A') + j)
        if letter in ('N', 'O'):
            continue
        blanks[letter] = j
    if blanks:
        needs[row_num] = (name, blanks)

print(f"Rows with fillable gaps: {len(needs)}")

# Rather than searching per SCPI, enumerate every SCPI page from the
# scpi-lab sitemap in one shot.
print("Fetching scpi-lab sitemap...")
try:
    sitemap_req = urllib.request.Request(
        "https://www.scpi-lab.com/sitemap.xml",
        headers={'User-Agent': 'Mozilla/5.0'})
    sitemap_resp = urllib.request.urlopen(sitemap_req, timeout=15)
    sitemap_xml = sitemap_resp.read().decode('utf-8')
    # Keep only main SCPI pages, dropping /information/ subpages.
    lab_urls = [
        u for u in re.findall(
            r'<loc>(https://www\.scpi-lab\.com/scpi/[^<]+)</loc>', sitemap_xml)
        if '/information/' not in u
    ]
    print(f"Found {len(lab_urls)} SCPI pages on scpi-lab")
except Exception as e:
    lab_urls = []
    print(f"Sitemap error: {e}")

# Build name -> URL map
# One-pass translation table: fold common French accented letters to ASCII
# and delete apostrophes, spaces, hyphens and parentheses, so sheet names
# and URL slugs compare equal. (str.translate replaces the chain of
# per-character .replace() loops.)
_NORM_TABLE = str.maketrans("éèêëâàîïôûüç", "eeeeaaiiouuc", "' -()")


def normalize(s):
    """Normalize a SCPI name or URL slug to a comparison key.

    Lowercases, strips surrounding whitespace, folds common French accents
    and strips separators, e.g. 'Épargne Pierre' -> 'epargnepierre'.
    """
    return s.lower().strip().translate(_NORM_TABLE)

# normalized-name -> URL lookup built from the sitemap slugs.
lab_map = {}
_slug_re = re.compile(r'/scpi/(?:scpi-)?([^/?#]+?)(?:-\d+)?$')
for page_url in lab_urls:
    # URL shape: scpi-lab.com/scpi/[scpi-]<name>[-<numeric id>]
    match = _slug_re.search(page_url)
    if not match:
        continue
    key = normalize(match.group(1).replace('scpi-', ''))
    lab_map[key] = page_url

print(f"Lab map entries: {len(lab_map)}")

# --- Main fill loop -----------------------------------------------------
# For each row with gaps: resolve its scpi-lab page, regex-scrape up to
# four metrics from the page text, and write each found value back to its
# sheet cell.
total = 0  # cells successfully written across all rows
for r, (scpi, empty) in sorted(needs.items()):
    norm = normalize(scpi)
    url = lab_map.get(norm)
    
    if not url:
        # Try partial match: sheet name and slug may differ by a suffix,
        # e.g. "epargnepierre" vs "epargnepierreeurope".
        for k, v in lab_map.items():
            if norm in k or k in norm:
                url = v
                break
    
    if not url:
        continue
    
    text = web_fetch_text(url, 8000)
    if not text:
        continue
    
    updates = []  # (column letter, value string) pairs to write
    
    # K: endettement (debt ratio) - look for "Endettement ... NN%" section.
    # Bounds 0.1..100 reject stray zeros / implausible percentages.
    m = re.search(r'endettement[^0-9]*?(\d+[,.]?\d*)\s*%', text, re.I)
    if m and 'K' in empty:
        val = m.group(1)
        try:
            num = float(val.replace(',', '.'))  # French decimal comma
            if 0.1 <= num <= 100:
                updates.append(('K', f"{val}%"))
        except:
            pass
    
    # M: parts en attente - look for "Parts en attente" / "parts de retrait";
    # digits may be thousand-grouped with spaces, which are stripped.
    m = re.search(r'(?:Parts?|attente)\s+(?:en\s+attente|de\s+retrait)[^0-9]*(\d[\d\s]*)', text, re.I)
    if m and 'M' in empty:
        val = m.group(1).strip().replace(' ', '')
        if val and val != '0':  # zero pending shares is not worth writing
            updates.append(('M', val))
    
    # U: TRI 10 ans (10-year IRR); may be negative, written with an ASCII
    # hyphen, an en dash or a true minus sign - normalized to '-'.
    m = re.search(r'TRI\s+(?:à\s+)?10\s+ans?[^0-9]*?([−–-]?\d+[,.]?\d*)\s*%', text, re.I)
    if m and 'U' in empty:
        val = m.group(1).replace('−', '-').replace('–', '-')
        try:
            num = float(val.replace(',', '.'))
            if -50 <= num <= 50:  # plausibility window for an IRR
                updates.append(('U', f"{val}%"))
        except:
            pass
    
    # T: PGA / taux de distribution (distribution yield).
    m = re.search(r'(?:Distribution\s+brute?\s+2025|taux\s+de\s+distribution)[^0-9]*?(\d+[,.]?\d*)\s*%', text, re.I)
    if m and 'T' in empty:
        val = m.group(1)
        try:
            num = float(val.replace(',', '.'))
            if 0.1 <= num <= 30:  # plausibility window for a yield
                updates.append(('T', f"{val}%"))
        except:
            pass
    
    if not updates:
        continue
    
    # Write each cell individually; the short sleeps keep the script under
    # the Sheets API write-rate limit.
    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            total += 1
        else:
            print(f"{col}❌ ", end="", flush=True)
        time.sleep(0.12)
    print(f"({row_cells})")
    time.sleep(0.3)

print(f"\n{'='*60}")
print(f"🏁 TOTAL filled: {total}")
# NOTE(review): removed stray shell heredoc terminator "PYEOF" — it was a
# paste artifact and a Python SyntaxError.