#!/usr/bin/env python3
"""
SCPI PDF Batch — Download bulletins trimestriels from scpi-lab, extract RAN & PGE.
"""
import json, re, subprocess, time, sys, os, urllib.request

# Google Sheets spreadsheet ID of the "Listing sites" tracking sheet.
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"
# Scratch directory where downloaded bulletin PDFs are cached.
PDF_DIR = "/tmp/scpi_pdfs"
os.makedirs(PDF_DIR, exist_ok=True)

def gws_update(range_str, value):
    """Write *value* into one cell of the 'Listing sites' tab via the gws CLI.

    Returns True when the API response mentions 'updatedCells'.
    """
    full_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": full_range, "valueInputOption": "USER_ENTERED"}
    body = {"range": full_range, "values": [[value]]}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps(params),
           '--json', json.dumps(body)]
    result = subprocess.run(cmd, capture_output=True, text=True,
                            cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in result.stdout

def read_sheet():
    """Read 'Listing sites'!A1:Y111 via the gws CLI.

    Returns the list of row value-lists, or [] when the CLI call fails
    (a failed gws invocation leaves stdout empty or non-JSON, which
    previously crashed json.loads).
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True,
                       cwd='/home/shingokuga/.openclaw/workspace')
    try:
        return json.loads(r.stdout).get('values', [])
    except json.JSONDecodeError:
        # gws printed an error (or nothing) instead of a JSON payload.
        return []

def web_fetch_html(url):
    """Fetch *url* and return the body decoded as UTF-8 (undecodable bytes
    dropped), or None on any fetch failure.

    Fixes over the original: the response is closed via a context manager
    (it was previously leaked), and the bare `except:` is narrowed to
    `except Exception` so KeyboardInterrupt/SystemExit still propagate.
    """
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            return resp.read().decode('utf-8', errors='ignore')
    except Exception:
        # Best-effort scrape: callers treat None as "page unavailable".
        return None

def download_pdf(url, path):
    """Download *url* into the file *path*.

    Returns True on success, False on any fetch or write error.
    Fixes over the original: the HTTP response is closed via a context
    manager (previously leaked), and the bare `except:` is narrowed to
    `except Exception` so KeyboardInterrupt/SystemExit still propagate.
    """
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        })
        with urllib.request.urlopen(req, timeout=30) as resp, open(path, 'wb') as f:
            f.write(resp.read())
        return True
    except Exception:
        # Best-effort: the caller falls back to other bulletin sources.
        return False

def extract_pdf_text(pdf_path):
    """Return the text of *pdf_path* extracted with `pdftotext -layout`.

    Returns "" when pdftotext is missing, times out, or otherwise fails.
    The bare `except:` is narrowed: OSError covers a missing binary,
    SubprocessError covers TimeoutExpired and other run() failures.
    """
    try:
        result = subprocess.run(['pdftotext', '-layout', pdf_path, '-'],
                                capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.SubprocessError):
        return ""
    # pdftotext exits non-zero on a bad/missing PDF with empty stdout,
    # which already yields "" here — no extra returncode check needed.
    return result.stdout

def _first_group(patterns, text):
    """Return the cleaned first capture group of the first pattern that
    matches *text* (case-insensitive), or None if nothing matches.
    Cleaning = strip then drop internal ASCII spaces (thousands separators)."""
    for pat in patterns:
        m = re.search(pat, text, re.I)
        if m:
            return m.group(1).strip().replace(' ', '')
    return None

def extract_ran_pge(text):
    """Extract RAN (report à nouveau) and PGE (provision gros entretien)
    figures from bulletin text.

    Returns a (ran, pge) tuple of cleaned number strings; either element
    is None when no pattern matched. Patterns are tried in priority order,
    accepting values expressed in %, jours, or € depending on the pattern.
    """
    ran = _first_group([
        r'[Rr][eé]port\s+[aà]\s+nouveau[^:]*?:?\s*([0-9]+[\s,.]*[0-9]*)\s*%',
        r'[Rr][eé]port\s+[aà]\s+nouveau[^:]*?:?\s*([0-9]+[\s,.]*[0-9]*)\s*jours?',
        r'RAN[^0-9]*?(\d+[,.]?\d*)\s*(?:jours?|%)',
        r'Report[^:]*?:\s*(\d+[,.]?\d*)\s*(?:jours?|%|€)',
        r'RAN\s*(?:en\s*jours?)?\s*:\s*(\d+[,.]?\d*)',
    ], text)

    pge = _first_group([
        r'[Pp]rovision\s+(?:pour\s+)?gros\s+entretien[^:]*?:?\s*([0-9]+[\s,.]*[0-9]*)\s*(?:%|€|M€)',
        r'PGE[^0-9]*?(\d+[,.]?\d*)\s*(?:%|€|M€)',
        r'Provision\s+gros\s+entretien\s*:\s*([0-9]+[\s,.]*[0-9]*)',
    ], text)

    return ran, pge

# Read the tracking sheet once up front.
rows = read_sheet()

# Build scpi-lab URL map from sitemap
print("Fetching scpi-lab sitemap...")
req = urllib.request.Request("https://www.scpi-lab.com/sitemap.xml", headers={'User-Agent': 'Mozilla/5.0'})
# Context manager closes the HTTP connection (the original leaked the
# response object).
with urllib.request.urlopen(req, timeout=15) as resp:
    xml = resp.read().decode('utf-8')
lab_urls = re.findall(r'<loc>(https://www\.scpi-lab\.com/scpi/[^<]+)</loc>', xml)
# Filter to main pages only (drop per-bulletin /information/ sub-pages)
lab_main = [u for u in lab_urls if '/information/' not in u]
print(f"Found {len(lab_main)} main SCPI pages")

# One-pass translation table: common French accents -> ASCII, and
# punctuation/spaces deleted entirely (value None removes the char).
def norm(s):
    """Normalize an SCPI name for fuzzy matching.

    Lowercases, trims, strips common French accents, and removes
    apostrophes, spaces, hyphens and parentheses.
    """
    table = str.maketrans({
        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
        'â': 'a', 'à': 'a',
        'î': 'i', 'ï': 'i',
        'ô': 'o',
        'û': 'u', 'ü': 'u',
        'ç': 'c',
        "'": None, ' ': None, '-': None, '(': None, ')': None,
    })
    return s.lower().strip().translate(table)

# Map normalized SCPI name -> scpi-lab page URL, derived from each URL's
# slug ("/scpi/scpi-<name>-<id>" -> "<name>").
lab_map = {}
for page_url in lab_main:
    slug = page_url.split('/scpi/')[-1].split('?')[0].split('#')[0]
    base = re.sub(r'-\d+$', '', re.sub(r'^scpi-', '', slug))
    lab_map[norm(base)] = page_url

# Collect sheet rows whose RAN (column N, index 13) or PGE (column O,
# index 14) cell is empty or holds a placeholder value.
# needs maps 1-based sheet row -> (scpi name, N missing?, O missing?).
needs = {}
_missing = ('', 'N/A', 'Non trouvé', '-')
for sheet_row, row in enumerate(rows[1:], start=2):
    name = row[1].strip() if len(row) > 1 else ''
    if not name:
        continue
    cell_n = row[13].strip() if len(row) > 13 else ''
    cell_o = row[14].strip() if len(row) > 14 else ''
    miss_n = cell_n in _missing
    miss_o = cell_o in _missing
    if miss_n or miss_o:
        needs[sheet_row] = (name, miss_n, miss_o)

print(f"Rows needing RAN/PGE: {len(needs)}")

# Running counts of successfully written cells (N = RAN, O = PGE).
total_n = 0
total_o = 0

# Process target rows in ascending sheet order for stable, resumable output.
for r, (scpi, need_n, need_o) in sorted(needs.items()):
    # Find scpi-lab page: exact normalized-name match first, then a loose
    # substring match in either direction.
    url = lab_map.get(norm(scpi))
    if not url:
        for k, v in lab_map.items():
            if norm(scpi) in k or k in norm(scpi):
                url = v
                break
    if not url:
        continue

    # Fetch the SCPI page to find bulletin PDF links
    html = web_fetch_html(url)
    if not html:
        continue

    # Find the latest bulletin PDF link
    # Pattern: /information/bt4-2025 or bt3-2025
    pdf_links = re.findall(r'href="(https://adm[^"]+\.pdf)"', html)
    bt_links = re.findall(r'href="(/scpi/[^"]+information/bt\d+-\d{4})"', html)

    # Get the bulletin page URL
    # NOTE(review): this takes the FIRST link that is either Q4 or Q3 2025;
    # if a Q3 link precedes a Q4 link in the page, Q3 wins — confirm that
    # is intended (a two-pass scan would always prefer Q4).
    bt_page_url = None
    for bt in bt_links:
        if 'bt4-2025' in bt:
            bt_page_url = f"https://www.scpi-lab.com{bt}"
            break
        elif 'bt3-2025' in bt:
            bt_page_url = f"https://www.scpi-lab.com{bt}"
            break

    # Also find direct PDF links (hosted on the adm* host)
    direct_pdf = None
    for pl in pdf_links:
        if 'BI-2025-T4' in pl or 'BI-2025-T3' in pl:
            direct_pdf = pl
            break

    # Try downloading the bulletin PDF
    pdf_path = f"{PDF_DIR}/scpi_{r}_{norm(scpi)}.pdf"
    text = ""

    # Source 1: direct PDF link found on the SCPI page.
    if direct_pdf:
        if download_pdf(direct_pdf, pdf_path):
            text = extract_pdf_text(pdf_path)

    # Source 2: the bulletin sub-page and any PDF links it carries.
    if not text and bt_page_url:
        # Fetch bulletin page and find PDF link
        bt_html = web_fetch_html(bt_page_url)
        if bt_html:
            pdfs = re.findall(r'href="(https://adm[^"]+\.pdf)"', bt_html)
            if pdfs:
                for p in pdfs:
                    if download_pdf(p, pdf_path):
                        text = extract_pdf_text(pdf_path)
                        if text:
                            break

    # Source 3 (fallback): strip tags from the bulletin page itself and
    # run the extraction regexes over the raw page text.
    if not text:
        # Try the bulletin page text directly
        if bt_page_url:
            bt_html = web_fetch_html(bt_page_url)
            if bt_html:
                # Extract text from the page
                bt_html2 = re.sub(r'<script[^>]*>.*?</script>', '', bt_html, flags=re.S|re.I)
                bt_html2 = re.sub(r'<style[^>]*>.*?</style>', '', bt_html2, flags=re.S|re.I)
                bt_html2 = re.sub(r'<[^>]+>', ' ', bt_html2)
                text = bt_html2[:15000]

    if not text:
        continue

    ran, pge = extract_ran_pge(text)

    # Build the list of (column, value) cell writes for this row.
    updates = []
    if need_n and ran:
        # NOTE(review): the RAN regexes also match '%' values, but the cell
        # is always labelled 'jours' — confirm the unit is correct.
        updates.append(('N', f"{ran} jours"))
    if need_o and pge:
        updates.append(('O', f"{pge}"))

    if not updates:
        # Still try to mark as "Non publié" if we found a bulletin but no RAN/PGE
        # (the 500-char floor guards against marking based on an error page).
        if len(text) > 500:
            if need_n:
                updates.append(('N', 'Non publié'))
            if need_o:
                updates.append(('O', 'Non publié'))

    if not updates:
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            if col == 'N': total_n += 1
            if col == 'O': total_o += 1
        else:
            print(f"{col}❌ ", end="", flush=True)
        # Short pauses keep the write rate under the Sheets API quota.
        time.sleep(0.12)
    print()
    time.sleep(0.3)

# Final summary.
# BUG FIX: removed the stray `PYEOF` heredoc terminator that trailed this
# script (leftover from a shell `python3 <<'PYEOF'` invocation); as a bare
# name it raised NameError at the end of every run.
print(f"\n{'='*60}")
print(f"🏁 RAN filled: {total_n}, PGE filled: {total_o}")
print(f"Total: {total_n + total_o}")