#!/usr/bin/env python3
"""
SCPI Batch Filler — Master script for scpi-scraper agent.
Uses ideal-investisseur JSON extraction + commentbieninvestir as fallback.
Processes ALL rows with gaps.

KEY DISCOVERY: ideal-investisseur pages contain structured JSON in 
<script type="application/json"> blocks with ALL financial data.
"""
import json, re, subprocess, time, sys, os

# Google Sheet that holds the SCPI listing (data lives in the "Listing sites" tab).
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

# === STEP 1: Read current sheet ===
def read_sheet():
    """Fetch rows A1:Y111 of the "Listing sites" tab via the `gws` CLI.

    Returns:
        list[list[str]]: the sheet rows (possibly ragged — short rows are
        not padded); [] when the response has no 'values' key.

    Raises:
        RuntimeError: if `gws` exits non-zero, with its stderr attached.
            (Previously a CLI failure surfaced as an opaque JSONDecodeError
            on empty stdout.)
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0:
        raise RuntimeError(f"gws sheets get failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def gws_update(range_str, value):
    """Write a single value to Listing sites!<range_str> via the `gws` CLI.

    Returns True when the CLI response mentions 'updatedCells' (write
    acknowledged by the Sheets API), False otherwise.
    """
    target = f"Listing sites!{range_str}"
    params = {
        "spreadsheetId": SHEET_ID,
        "range": target,
        "valueInputOption": "USER_ENTERED",
    }
    payload = {"range": target, "values": [[value]]}
    argv = [
        'gws', 'sheets', 'spreadsheets', 'values', 'update',
        '--params', json.dumps(params),
        '--json', json.dumps(payload),
    ]
    proc = subprocess.run(argv, capture_output=True, text=True,
                          cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in proc.stdout

# === STEP 2: Build sitemap-based URL map ===
print("Building URL map from sitemap...")
from urllib.request import urlopen, Request
try:
    # Browser-like UA: the site may reject urllib's default User-Agent.
    sitemap_req = Request("https://www.ideal-investisseur.fr/sitemap.xml",
                          headers={'User-Agent': 'Mozilla/5.0'})
    sitemap_body = urlopen(sitemap_req, timeout=15).read().decode('utf-8')
    # Only the scpi-avis pages are of interest here.
    sitemap_urls = re.findall(
        r'<loc>(https://www\.ideal-investisseur\.fr/scpi-avis/[^<]+)</loc>',
        sitemap_body)
    print(f"  Found {len(sitemap_urls)} URLs in sitemap")
except Exception as e:
    # Best-effort: the manual map below is the real source of URLs.
    sitemap_urls = []
    print(f"  Sitemap error: {e}")

# Manual URL map (verified): SCPI display name -> ideal-investisseur numeric
# page id. Combined below with a slugified name to form the scpi-avis URL.
MANUAL_URLS = {
    "Eden": 10150, "Elialys": 1003, "Eurovalys": 1002, "Aestiam Agora": 1028,
    "Aestiam Horizon": 1071, "Linaclub": 1004, "AEW Commerces Europe": 10058,
    "AEW Diversification Allemagne": 10025, "AEW Opportunités Europe": 10026,
    "AEW Patrimoine Santé": 10027, "Atout Pierre Diversification": 1146,
    "Activimmo": 10009, "Comète": 10045, "Allianz Pierre": 1030,
    "Alta Convictions": 10153, "Altixia Cadence XII": 1004,
    "Altixia Commerces": 1006, "Edissimmo": 1054, "Genepierre": 1077,
    "Rivoli Avenir Patrimoine": 1025, "MomenTime": 10149,
    "Transitions Europe": 10040, "New Gen": 10021,
    "Epargne Pierre": 10014, "Epargne Pierre Europe": 10143,
    "Epargne Pierre Sophia": 10144, "Atream Hôtels": 1032,
    "Upeka": 10147, "Accès Valeur Pierre": 1029, "Accimmo Pierre": 1050,
    "Imarea Pierre": 10111, "Opus Real": 1119, "Optimale": 10005,
    "Corum Eurion": 10000, "Corum Origin": 1000, "Corum USA": 1001,
    "Corum XL": 1097, "Darwin RE01": 10138,
    "Edmond de Rothschild Europa": 10133, "Elevation Tertiom": 10137,
    "Epsicap Explore": 10019, "Epsicap Nano": 10034,
    "Euryale Horizons Santé": 10035, "Pierval Santé": 1055,
    "Buroboutic Métropoles": 10116, "Ficommerce Proximité": 10115,
    "Logipierre 3 Résidences Services": 10118,
    "Pierre Expansion Santé": 10117, "Selectipierre 2 - Paris": 1072,
    "Cap Foncières et Territoires": 1045, "GMA Essentialis": 10008,
    "Affinités Pierre": 1031, "Attraits Pierre": 10141,
    "Elysées Grand Large": 10103, "Elysées Pierre": 1041,
    "Cristal Life": 10033, "Cristal Rente": 10033,
    "Iroko Atlas": 10006, "Iroko Zen": 10007,
    "Kyaneos Pierre": 10032, "Crédit Mutuel Pierre 1": 1065,
    "Epargne Foncière": 1001, "LF Avenir Santé": 1062,
    "LF Croissance et Territoires": 10152, "LF Europimmo": 1129,
    "LF Grand Paris Patrimoine": 1114, "LF Opportunité Immo": 1115,
    "Selectinvest 1": 1126, "Foncière Des Praticiens": 1144,
    "Osmo Energie": 1147, "Reason": 10039,
    "My Share Education": 10124, "My Share SCPI": 1145,
    "NCap Continent": 10122, "NCap Education Santé": 10123,
    "NCap Régions": 1140, "Novaxia Neo": 1099, "Novapierre 1": 1075,
    "Paref Evo": 10162, "Paref Hexa": 10164, "Paref Prima": 10163,
    "Perial Grand Paris": 1067, "Perial Hospitalité Europe": 1070,
    "Perial O2": 1068, "Perial Opportunités Europe": 1066,
    "Perial Opportunités Territoires": 1065,
    "Patrimmo Commerce": 10110, "Praemia Hotels Europe": 1131,
    "Primopierre": 1081, "Primovie": 1024,
    "Principal Inside": 1132, "Remake Live": 10017,
    "Remake UK 2025": 1127, "Efimmo 1": 1063,
    "Immorente": 1026, "Sofiboutique": 10023,
    "Sofidy Europe Invest": 10129, "Sofidynamic": 10128,
    "Sofipierre": 10020, "Coeur d'Avenir": 1102,
    "Coeur d'Europe": 10042, "Coeur de régions": 1103,
    "Coeur de ville": 10044, "Esprit Horizon": 10136,
    "ESG Pierre Capitale": 1134, "Mistral Sélection": 10130,
    "Telamon Borea": 1106, "LOG IN": 10036,
    "Urban Coeur Commerce": 1141, "Wemo One": 10131,
}

# Build name -> URL map: slugify each SCPI name (fold accents, drop
# quotes/parens, dash the spaces) and append the page id.
# One-pass accent folding via str.translate replaces the previous
# per-character full-string replace() chain (same mapping: éèêë->e,
# âà->a, îï->i, ô->o, ûü->u, ç->c).
_ACCENT_MAP = str.maketrans("éèêëâàîïôûüç", "eeeeaaiiouuc")

def _slugify(name):
    """Return the URL slug for a SCPI display name."""
    slug = name.lower().translate(_ACCENT_MAP)
    return slug.replace("'", '').replace(' ', '-').replace('(', '').replace(')', '')

URL_MAP = {
    name: f"https://www.ideal-investisseur.fr/scpi-avis/{_slugify(name)}-{iid}.html"
    for name, iid in MANUAL_URLS.items()
}
# === STEP 3: Fetch + JSON extraction ===
# Reuse the scrapling install from the web-scraper skill's venv — it is not
# on this interpreter's default sys.path.
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

def fetch_json(url, fetcher):
    """Fetch `url` with `fetcher` and return its embedded data dict.

    Prefers the structured <script type="application/json"> block that
    ideal-investisseur pages embed (recognised by a 'latest' key); when no
    such block parses, falls back to plain-text extraction via
    extract_from_html. Returns None on HTTP errors, stub pages (< 500
    bytes) or any fetch/parse failure.
    """
    try:
        page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
        # Reject errors and stub pages — anything under 500 bytes is not a
        # real listing page.
        if page.status != 200 or not page.body or len(page.body) < 500:
            return None
        html = page.body.decode('utf-8', errors='ignore')
        blocks = re.findall(r'<script[^>]*type="application/json"[^>]*>(.*?)</script>', html, re.S)
        for block in blocks:
            try:
                data = json.loads(block)
                if 'latest' in data:
                    return data
            except ValueError:  # invalid JSON in this block — try the next one
                pass

        # No JSON block — fall back to HTML text extraction
        return extract_from_html(html)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as "no data".
        return None

def extract_from_html(html):
    """Fallback extractor: pull SCPI metrics out of raw page HTML.

    Strips scripts/styles and markup down to plain text, then runs a
    battery of French-label regexes over it. Returns a dict tagged
    '_source': 'html', or None when fewer than two metrics were found.
    """
    # --- HTML -> plain text -------------------------------------------------
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S|re.I)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th)[^>]*>', '\n', html, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', html)
    for entity, repl in (('&nbsp;', ' '), ('&amp;', '&'), ('&euro;', '€'), ('&rsquo;', "'")):
        text = text.replace(entity, repl)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n+', '\n', text)

    # --- labelled-value extraction -----------------------------------------
    def euros(m):   # "1 250 €" -> "1250"
        return m.group(1).strip().replace(' €', '').replace(' ', '')

    def digits(m):  # drop internal thousands spaces from a digit group
        return m.group(1).strip().replace(' ', '')

    # (key, pattern, transform); transform=None keeps group(1) verbatim.
    rules = [
        ('capital_type', r'Capital\s+(fixe|variable)', lambda m: m.group(1).capitalize()),
        ('taux_endettement', r"Taux\s+d'endettement\s+(\d+[,.]?\d*)\s*%", None),
        ('tof', r"\bTOF\s+(\d+[,.]?\d*)\s*%", None),
        ('prix_souscription', r'Prix\s+de\s+souscription\s+(\d[\d\s,.]*€)', euros),
        ('valeur_reconstitution', r'Valeur\s+de\s+reconstitution\s+(\d[\d\s,.]*€)', euros),
        ('taux_distribution', r"Taux\s+de\s+distribution\s*(?:\d{4})?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%", None),
        ('tri_10_ans', r"TRI\s+(?:à\s+)?10\s+ans?\s*:?\s*[−–-]?\s*(\d+[,.]?\d*)\s*%", None),
        ('capitalisation', r'Capitalisation\s+(\d+[,.]?\d*)\s*(M€|Mds?€)', lambda m: m.group(1) + ' ' + m.group(2)),
        ('parts_en_attente_retrait', r"(\d[\d\s]*)\s*parts?\s*en\s*(?:attente\s*de\s*)?retrait", digits),
        ('comm_souscription', r"(?:Commission|Frais)\s+de\s+souscription\s+(\d+[,.]?\d*)\s*%", None),
        ('comm_gestion', r"(?:Commission|Frais)\s+de\s+gestion\s+(\d+[,.]?\d*)\s*%", None),
        ('delai_jouissance', r"(?:Délai|Durée)\s+de\s+jouissance\s*:?\s*(\d+)\s*mois", lambda m: m.group(1) + ' mois'),
        ('souscription_min', r"(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)", lambda m: m.group(1).strip()),
        ('nb_parts', r"(?:Nombre\s+de\s+parts|Parts?\s+émises?)\s*:?\s*(\d[\d\s]*)", digits),
    ]

    d = {'_source': 'html'}
    for key, pattern, transform in rules:
        m = re.search(pattern, text, re.I)
        if m:
            d[key] = transform(m) if transform else m.group(1)

    # Label ISR: an explicit "Non"/"No" after the label wins over a mere mention.
    if re.search(r'Label\s+ISR\s*(Non|No)', text, re.I):
        d['label_isr'] = False
    elif re.search(r'Label\s+ISR', text, re.I):
        d['label_isr'] = True

    # Require at least two extracted metrics besides the '_source' tag.
    return d if len(d) > 2 else None

def map_to_cols(data):
    """Map extracted data to sheet column letters.

    Accepts either the structured JSON dict (with a 'latest' sub-dict,
    plus an optional 'scpi' sub-dict) or the flat dict produced by
    extract_from_html (tagged '_source'). Returns {column_letter: value},
    skipping 0/null placeholders.

    Columns: D=capital type, G=label ISR, H=min subscription,
    I=creation date, J=capitalisation, K=debt ratio, L=share count,
    M=shares pending redemption, P=tax regime, Q=jouissance delay,
    S=TOF, T=distribution rate, U=10y IRR, V=subscription price,
    W=reconstitution value, X=subscription fee, Y=management fee.
    """
    d = {}
    latest = data.get('latest', {})
    is_json = '_source' not in data

    # String forms meaning "ISR label absent". BUG FIX: include
    # 'false'/'no' — a JSON boolean False stringifies to 'false', which
    # the old tuple ('null', '', 'non', '0') missed, so SCPIs without the
    # label were written as 'Oui'.
    isr_falsy = ('null', '', 'non', '0', 'false', 'no')

    if is_json:
        # From JSON — each field guarded against 0/null placeholders.
        v = latest.get('taux_endettement')
        if v and str(v) not in ('0', '0.00', 'null'): d['K'] = str(v) + '%'

        v = latest.get('tof')
        if v and str(v) not in ('0', '0.00', 'null'): d['S'] = str(v) + '%'

        v = latest.get('prix_souscription')
        if v and str(v) != '0': d['V'] = str(v) + ' €'

        v = latest.get('valeur_reconstitution')
        if v and str(v) not in ('0', '0.00', 'null'): d['W'] = str(v) + ' €'

        v = latest.get('taux_distribution')
        if v and str(v) not in ('0', '0.00', 'null'): d['T'] = str(v) + '%'

        v = latest.get('tri_10_ans')
        if v and str(v) not in ('0', '0.00', 'null'): d['U'] = str(v) + '%'

        v = latest.get('label_isr')
        if v is not None: d['G'] = 'Oui' if str(v).lower() not in isr_falsy else 'Non'

        v = latest.get('capitalisation')
        if v: d['J'] = str(v)

        v = latest.get('parts_en_attente_retrait')
        if v and str(v) not in ('0', 'null', ''): d['M'] = str(v)

        v = latest.get('nb_parts') or latest.get('nombre_parts')
        if v and str(v) not in ('0', 'null', ''): d['L'] = str(v)

        v = latest.get('comm_souscription')
        if v: d['X'] = str(v) + '% TTC'

        v = latest.get('comm_gestion')
        if v: d['Y'] = str(v) + '% TTC'

        v = latest.get('delai_jouissance')
        if v: d['Q'] = str(v)

        v = latest.get('souscription_min')
        if v: d['H'] = str(v)

        v = latest.get('capital_type')
        if v: d['D'] = str(v)

        v = latest.get('fiscalite')
        if v: d['P'] = str(v)

        # From scpi data
        scpi = data.get('scpi', {})
        v = scpi.get('date_creation')
        if v: d['I'] = str(v)

    else:
        # From HTML fallback — values are already formatted strings; only
        # the unit suffix per column group differs.
        for key, col in [
            ('taux_endettement', 'K'), ('tof', 'S'), ('taux_distribution', 'T'),
            ('tri_10_ans', 'U'), ('capitalisation', 'J'), ('comm_souscription', 'X'),
            ('comm_gestion', 'Y'), ('delai_jouissance', 'Q'), ('souscription_min', 'H'),
            ('nb_parts', 'L'), ('capital_type', 'D'),
        ]:
            v = data.get(key)
            if v:
                if col in ('K','S','T','U'): d[col] = str(v) + '%'
                elif col in ('X','Y'): d[col] = str(v) + '% TTC'
                else: d[col] = str(v)

        v = data.get('label_isr')
        if v is not None: d['G'] = 'Oui' if v else 'Non'

        v = data.get('parts_en_attente_retrait')
        if v and str(v) != '0': d['M'] = str(v)

        v = data.get('prix_souscription')
        if v: d['V'] = str(v) + ' €'

        v = data.get('valeur_reconstitution')
        if v and str(v) != '0': d['W'] = str(v) + ' €'

    return d

# === MAIN ===
rows = read_sheet()

# Scan every data row (skipping the header) for blank/placeholder cells in
# columns D..Y (indices 3..24); record them as
# {sheet_row_number: (scpi_name, {column_letter: column_index})}.
PLACEHOLDERS = ('', 'N/A', 'Non trouvé', '-')
needs = {}
for r, row in enumerate(rows[1:], start=2):  # r = 1-based sheet row number
    scpi = row[1].strip() if len(row) > 1 else ''
    if not scpi:
        continue
    empty = {}
    for j in range(3, 25):
        cell = row[j].strip() if j < len(row) else ''
        if cell in PLACEHOLDERS:
            empty[chr(65 + j)] = j
    if empty:
        needs[r] = (scpi, empty)

print(f"\nRows needing fixes: {len(needs)}")
print(f"URLs in map: {len(URL_MAP)}")

fetcher = Fetcher(auto_match=False)
total_cells = 0
total_rows = 0
results = {}

for r, (scpi, empty) in sorted(needs.items()):
    # Skip rows with no known source URL, or whose page yields nothing.
    url = URL_MAP.get(scpi)
    if url is None:
        results[scpi] = 'no_url'
        continue

    data = fetch_json(url, fetcher)
    if not data:
        results[scpi] = 'no_data'
        continue

    # Only write into cells that are both empty on the sheet and present
    # in the scraped data.
    mapped = map_to_cols(data)
    pending = [(c, mapped[c]) for c in empty if c in mapped]
    if not pending:
        results[scpi] = 'no_match'
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    filled = 0
    for col, value in pending:
        ok = gws_update(f"{col}{r}", value)
        print(f"{col}{'✅' if ok else '❌'} ", end="", flush=True)
        if ok:
            filled += 1
            time.sleep(0.12)  # throttle successive cell writes
    print(f"({filled})")
    total_rows += 1
    total_cells += filled
    results[scpi] = f'{filled} cells'
    time.sleep(0.25)  # per-row throttle

# Summary
print(f"\n\n{'='*60}")
print(f"🏁 TOTAL: {total_rows} rows, {total_cells} cells filled")

# Report skipped SCPIs grouped by failure reason (first 10 names each).
print()
for label, tag in (('No URL', 'no_url'), ('No data', 'no_data'), ('No match', 'no_match')):
    names = [scpi for scpi, status in results.items() if status == tag]
    print(f"{label} ({len(names)}): {', '.join(names[:10])}")

# Save results for agent
with open('/tmp/scpi_batch_results.json', 'w') as fh:
    json.dump(results, fh, ensure_ascii=False, indent=2)
