#!/usr/bin/env python3
"""SCPI Batch Filler v5 — scpi-lab.com extraction for remaining gaps.
Uses web_fetch to get scpi-lab pages (no JS rendering needed)."""
import json
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request

SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

def gws_update(range_str, value):
    """Write a single cell value into the 'Listing sites' tab via the gws CLI.

    Returns True when the CLI output mentions 'updatedCells' (write accepted).
    """
    sheet_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": sheet_range, "valueInputOption": "USER_ENTERED"}
    body = {"range": sheet_range, "values": [[value]]}
    argv = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
            '--params', json.dumps(params),
            '--json', json.dumps(body)]
    proc = subprocess.run(argv, capture_output=True, text=True,
                          cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in proc.stdout

def read_sheet():
    """Fetch rows A1:Y111 of the 'Listing sites' tab via the gws CLI.

    Returns the API's 'values' matrix (list of row lists), or [] when the
    tab has no values. Raises RuntimeError with the CLI's stderr when the
    command fails — previously an empty/garbage stdout surfaced as an
    opaque json.JSONDecodeError.
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True,
                       cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0 or not r.stdout.strip():
        raise RuntimeError(
            f"gws sheets get failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def web_fetch_text(url, max_chars=10000):
    """Fetch *url* and return up to *max_chars* of readable text, or None.

    Best-effort: any failure (DNS, timeout, HTTP error, bad encoding)
    returns None so callers can simply skip the row. The old bare
    ``except:`` also swallowed KeyboardInterrupt/SystemExit — narrowed
    to Exception. The response is now closed via ``with``.
    """
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
            'Accept': 'text/html'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            html = resp.read().decode('utf-8', errors='ignore')
        return _html_to_text(html)[:max_chars]
    except Exception:
        return None


def _html_to_text(html):
    """Strip tags and common entities from HTML, keeping rough line structure."""
    # Drop script/style bodies entirely, then turn block-level tags into newlines.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S | re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    # Decode the handful of entities these pages emit. '&amp;' goes LAST:
    # decoding it first turned '&amp;lt;' into '<' (double-unescape bug).
    for old, new in [('&nbsp;', ' '), ('&euro;', '€'), ('&rsquo;', "'"),
                     ('&lt;', '<'), ('&gt;', '>'), ('&#39;', "'"),
                     ('&quot;', '"'), ('&amp;', '&')]:
        text = text.replace(old, new)
    # Collapse runs of spaces/tabs and blank lines.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text

# scpi-lab.com URL patterns - need to search for each SCPI
# Use the search on scpi-lab
def find_scpi_lab_url(scpi_name):
    """Search a local SearxNG instance for this SCPI's page on scpi-lab.com.

    Returns the first of the top-3 result URLs that is a main SCPI page
    (under scpi-lab.com/scpi/, not an /information/ subpage), or None when
    nothing matches or the search service is unreachable.
    """
    try:
        query = f"site:scpi-lab.com {scpi_name} SCPI"
        # FIX: quote() is documented in urllib.parse; the original called
        # urllib.request.quote, which only exists as an undocumented
        # re-export — and the bare `except` would have silently hidden an
        # AttributeError if that re-export ever disappeared.
        url = f"http://192.168.1.127:8081/search?q={urllib.parse.quote(query)}&format=json"
        req = urllib.request.Request(url)
        resp = urllib.request.urlopen(req, timeout=10)
        data = json.loads(resp.read().decode())
        for r in data.get('results', [])[:3]:
            u = r.get('url', '')
            if 'scpi-lab.com/scpi/' in u and '/information/' not in u:
                return u
    except Exception:
        # Best-effort: an unreachable search box just means "no URL found".
        pass
    return None

# Pull the whole sheet once.
rows = read_sheet()

# Map: sheet row number -> (SCPI name, {column letter: 0-based index}) for
# every data row that still has fillable blanks in columns D..Y
# (N and O are deliberately excluded).
needs = {}
for row_num, row in enumerate(rows[1:], start=2):
    name = row[1].strip() if len(row) > 1 else ''
    if not name:
        continue
    blanks = {}
    for j in range(3, 25):
        cell = row[j].strip() if j < len(row) else ''
        if cell not in ('', 'N/A', 'Non trouvé', '-'):
            continue
        letter = chr(ord('A') + j)
        if letter in ('N', 'O'):
            continue
        blanks[letter] = j
    if blanks:
        needs[row_num] = (name, blanks)

print(f"Rows with fillable gaps: {len(needs)}")

# Rather than searching per SCPI, enumerate every SCPI page from the
# scpi-lab sitemap in one shot.
print("Fetching scpi-lab sitemap...")
try:
    sitemap_req = urllib.request.Request(
        "https://www.scpi-lab.com/sitemap.xml",
        headers={'User-Agent': 'Mozilla/5.0'})
    sitemap_resp = urllib.request.urlopen(sitemap_req, timeout=15)
    sitemap_xml = sitemap_resp.read().decode('utf-8')
    # Keep only main SCPI pages, dropping /information/ subpages.
    lab_urls = [
        u for u in re.findall(
            r'<loc>(https://www\.scpi-lab\.com/scpi/[^<]+)</loc>', sitemap_xml)
        if '/information/' not in u
    ]
    print(f"Found {len(lab_urls)} SCPI pages on scpi-lab")
except Exception as e:
    lab_urls = []
    print(f"Sitemap error: {e}")

# Build name -> URL map
# One-pass translation table: fold common French accented letters to ASCII
# and delete apostrophes, spaces, hyphens and parentheses, so sheet names
# and URL slugs compare equal. (str.translate replaces the chain of
# per-character .replace() loops.)
_NORM_TABLE = str.maketrans("éèêëâàîïôûüç", "eeeeaaiiouuc", "' -()")


def normalize(s):
    """Normalize a SCPI name or URL slug to a comparison key.

    Lowercases, strips surrounding whitespace, folds common French accents
    and strips separators, e.g. 'Épargne Pierre' -> 'epargnepierre'.
    """
    return s.lower().strip().translate(_NORM_TABLE)

# normalized-name -> URL lookup built from the sitemap slugs.
lab_map = {}
_slug_re = re.compile(r'/scpi/(?:scpi-)?([^/?#]+?)(?:-\d+)?$')
for page_url in lab_urls:
    # URL shape: scpi-lab.com/scpi/[scpi-]<name>[-<numeric id>]
    match = _slug_re.search(page_url)
    if not match:
        continue
    key = normalize(match.group(1).replace('scpi-', ''))
    lab_map[key] = page_url

print(f"Lab map entries: {len(lab_map)}")

# --- Main fill loop -----------------------------------------------------
# For each row with gaps: resolve its scpi-lab page, regex-scrape up to
# four metrics from the page text, and write each found value back to its
# sheet cell.
total = 0  # cells successfully written across all rows
for r, (scpi, empty) in sorted(needs.items()):
    norm = normalize(scpi)
    url = lab_map.get(norm)
    
    if not url:
        # Try partial match: sheet name and slug may differ by a suffix,
        # e.g. "epargnepierre" vs "epargnepierreeurope".
        for k, v in lab_map.items():
            if norm in k or k in norm:
                url = v
                break
    
    if not url:
        continue
    
    text = web_fetch_text(url, 8000)
    if not text:
        continue
    
    updates = []  # (column letter, value string) pairs to write
    
    # K: endettement (debt ratio) - look for "Endettement ... NN%" section.
    # Bounds 0.1..100 reject stray zeros / implausible percentages.
    m = re.search(r'endettement[^0-9]*?(\d+[,.]?\d*)\s*%', text, re.I)
    if m and 'K' in empty:
        val = m.group(1)
        try:
            num = float(val.replace(',', '.'))  # French decimal comma
            if 0.1 <= num <= 100:
                updates.append(('K', f"{val}%"))
        except:
            pass
    
    # M: parts en attente - look for "Parts en attente" / "parts de retrait";
    # digits may be thousand-grouped with spaces, which are stripped.
    m = re.search(r'(?:Parts?|attente)\s+(?:en\s+attente|de\s+retrait)[^0-9]*(\d[\d\s]*)', text, re.I)
    if m and 'M' in empty:
        val = m.group(1).strip().replace(' ', '')
        if val and val != '0':  # zero pending shares is not worth writing
            updates.append(('M', val))
    
    # U: TRI 10 ans (10-year IRR); may be negative, written with an ASCII
    # hyphen, an en dash or a true minus sign - normalized to '-'.
    m = re.search(r'TRI\s+(?:à\s+)?10\s+ans?[^0-9]*?([−–-]?\d+[,.]?\d*)\s*%', text, re.I)
    if m and 'U' in empty:
        val = m.group(1).replace('−', '-').replace('–', '-')
        try:
            num = float(val.replace(',', '.'))
            if -50 <= num <= 50:  # plausibility window for an IRR
                updates.append(('U', f"{val}%"))
        except:
            pass
    
    # T: PGA / taux de distribution (distribution yield).
    m = re.search(r'(?:Distribution\s+brute?\s+2025|taux\s+de\s+distribution)[^0-9]*?(\d+[,.]?\d*)\s*%', text, re.I)
    if m and 'T' in empty:
        val = m.group(1)
        try:
            num = float(val.replace(',', '.'))
            if 0.1 <= num <= 30:  # plausibility window for a yield
                updates.append(('T', f"{val}%"))
        except:
            pass
    
    if not updates:
        continue
    
    # Write each cell individually; the short sleeps keep the script under
    # the Sheets API write-rate limit.
    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            total += 1
        else:
            print(f"{col}❌ ", end="", flush=True)
        time.sleep(0.12)
    print(f"({row_cells})")
    time.sleep(0.3)

print(f"\n{'='*60}")
print(f"🏁 TOTAL filled: {total}")
# NOTE(review): removed stray shell heredoc terminator "PYEOF" — it was a
# paste artifact and a Python SyntaxError.