#!/usr/bin/env python3
"""
Pass 3: ideal-investisseur.fr — focus on K (endettement), L (nb parts), M (parts retrait),
N (RAN), O (PGE), Q (jouissance), S (TOF), T (PGA), U (TRI), X/Y (commissions)
Uses Scrapling Fetcher for reliable HTML extraction.
"""
import html
import json
import re
import subprocess
import sys
import time
import unicodedata
import urllib.parse
from urllib.request import Request, urlopen

# Scrapling lives in a skill venv, not on the default sys.path.
sys.path.insert(0, '/home/shingokuga/.openclaw/workspace/skills/scrapling-web-scraper/venv/lib/python3.12/site-packages')
from scrapling import Fetcher

# Google Sheet holding the SCPI listing (tab "Listing sites").
SHEET_ID = "1dBGv3jIsFDLMJInANzyLvUAF0HYb5iRZ_RQljo-i7XM"

def read_sheet():
    """Fetch 'Listing sites'!A1:Y111 via the `gws` CLI.

    Returns the sheet rows as a list of lists (possibly ragged — trailing
    empty cells are omitted by the API).

    Raises:
        RuntimeError: if the gws CLI exits non-zero or prints nothing,
            with the CLI's stderr included for diagnosis (the previous
            behavior was an opaque JSONDecodeError on empty output).
    """
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'get',
           '--params', json.dumps({"spreadsheetId": SHEET_ID, "range": "Listing sites!A1:Y111"})]
    r = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    if r.returncode != 0 or not r.stdout.strip():
        raise RuntimeError(f"gws sheets get failed (rc={r.returncode}): {r.stderr.strip()}")
    return json.loads(r.stdout).get('values', [])

def gws_update(range_str, value):
    """Write a single cell value to Listing sites!{range_str}.

    Returns True when the gws CLI output confirms the update
    (contains 'updatedCells'), False otherwise.
    """
    full_range = f"Listing sites!{range_str}"
    params = {"spreadsheetId": SHEET_ID, "range": full_range, "valueInputOption": "USER_ENTERED"}
    body = {"range": full_range, "values": [[value]]}
    cmd = ['gws', 'sheets', 'spreadsheets', 'values', 'update',
           '--params', json.dumps(params),
           '--json', json.dumps(body)]
    result = subprocess.run(cmd, capture_output=True, text=True, cwd='/home/shingokuga/.openclaw/workspace')
    return 'updatedCells' in result.stdout

def html_to_text(page):
    """Convert a fetched HTML page (any object with a ``.text`` attribute)
    into plain text suitable for the French-language regexes below.

    Steps: strip <script>/<style> blocks, turn structural tags into
    newlines, drop remaining tags, decode HTML entities, and collapse
    runs of whitespace.
    """
    markup = page.text
    markup = re.sub(r'<script[^>]*>.*?</script>', '', markup, flags=re.S | re.I)
    markup = re.sub(r'<style[^>]*>.*?</style>', '', markup, flags=re.S | re.I)
    markup = re.sub(r'<br\s*/?>', '\n', markup, flags=re.I)
    markup = re.sub(r'</?(?:p|div|h[1-6]|li|tr|td|th|dd|dt)[^>]*>', '\n', markup, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', markup)
    # html.unescape decodes ALL entities (&eacute;, &#39;, &amp;, ...), not
    # just the two the previous version replaced by hand — accented entities
    # otherwise break the French extraction regexes downstream.
    # &nbsp; decodes to U+00A0, which '[ \t]+' would miss, so map it to ' '.
    text = html.unescape(text).replace('\xa0', ' ')
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n', text)
    return text

def slugify(name):
    """Build a URL slug from an SCPI name.

    Lowercases, strips accents via Unicode NFKD decomposition (covers every
    accented Latin character, not just a hand-picked list), maps 'œ' to
    'oe', drops apostrophes (straight and curly) and parentheses, and
    replaces spaces with hyphens. Backward compatible with the previous
    hard-coded replacement table.
    """
    s = unicodedata.normalize('NFKD', name.lower().strip())
    # Dropping combining marks turns e.g. 'é' (e + U+0301) into plain 'e'.
    s = ''.join(c for c in s if not unicodedata.combining(c))
    s = s.replace('œ', 'oe')
    for ch in ("'", '’', '(', ')'):
        s = s.replace(ch, '')
    return s.replace(' ', '-')

# Ideal-investisseur URL patterns
def ii_url(name):
    """Stub URL builder for ideal-investisseur.fr SCPI pages.

    Most SCPI pages follow /scpi-avis/{slug}-{id}.html, but the numeric
    {id} cannot be derived from the name alone, so this always returns
    None; real URLs are discovered by candidate probing and search in the
    main loop below.
    """
    _ = slugify(name)  # kept so malformed names fail exactly as before
    return None

# Read current sheet state
rows = read_sheet()
headers = rows[0]

# Scan data rows (sheet rows 2..N) and record, per row, which of the
# data columns D..Y are still blank or hold a placeholder value.
# needs maps sheet row number -> (SCPI name, {column letter: column index}).
needs = {}
for r, row in enumerate(rows[1:], start=2):
    scpi = row[1].strip() if len(row) > 1 else ''
    if not scpi:
        continue

    empty = {}
    for j in range(3, 25):  # indices 3..24 == columns D..Y
        cell = row[j].strip() if j < len(row) else ''
        if cell in ('', 'N/A', 'Non trouvé', '-'):
            empty[chr(65 + j)] = j

    if empty:
        needs[r] = (scpi, empty)

print(f"Rows needing fixes: {len(needs)}")

# Strategy: use web_search to find ideal-investisseur URLs for SCPIs not covered by CBI
# For now, try direct ideal-investisseur scraping with known URL patterns
# auto_match=False: plain one-shot fetches; Scrapling's element re-matching is not needed.
fetcher = Fetcher(auto_match=False)

# Running totals for the end-of-run summary line.
total_cells = 0
total_rows = 0

# Known ideal-investisseur URL mappings (from previous sessions)
# Try searching by slug pattern
II_BASE = "https://www.ideal-investisseur.fr/scpi-avis/"

# Main pass: for each incomplete row, locate the SCPI's page on
# ideal-investisseur.fr, extract whatever figures the page exposes, and
# write back only the cells that were empty in the sheet.
for r, (scpi, empty) in sorted(needs.items()):
    slug = slugify(scpi)
    # Cheapest path first: guess the page URL directly from the slug.
    candidates = [
        f"{II_BASE}{slug}.html",
        f"{II_BASE}{slug}-avis.html",
    ]

    text = None
    page_url = None

    for url in candidates:
        try:
            page = fetcher.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
            # >2000 chars filters out soft-404 / stub pages.
            if page.status == 200 and len(page.text) > 2000:
                text = html_to_text(page)
                page_url = url
                break
        except Exception:
            # A failed candidate just means "try the next pattern".
            # (Was a bare except:, which also swallowed KeyboardInterrupt.)
            pass

    if not text:
        # Fallback: Google search restricted to the site — many pages carry
        # a numeric id we cannot guess from the name alone.
        try:
            q = urllib.parse.quote(f"site:ideal-investisseur.fr scpi {scpi}")
            search_url = f"https://www.google.com/search?q={q}&num=3"
            req = Request(search_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'})
            with urlopen(req, timeout=10) as resp:  # context manager closes the connection
                search_html = resp.read().decode('utf-8', errors='ignore')
            urls = re.findall(r'https://www\.ideal-investisseur\.fr/scpi-avis/[^"&\s]+', search_html)
            if urls:
                found_url = urls[0]
                page = fetcher.get(found_url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'})
                if page.status == 200 and len(page.text) > 2000:
                    text = html_to_text(page)
                    page_url = found_url
        except Exception:
            # Search is best-effort; failure leaves this row for another pass.
            pass

    if not text:
        continue

    # Extracted values, keyed by sheet column letter.
    d = {}

    # K: debt ratio (taux d'endettement), normalized to '12.3%'
    m = re.search(r'Taux\s+d\'endettement\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['K'] = m.group(1).replace(',', '.') + '%'

    # L: number of shares (thousands separators stripped)
    m = re.search(r'(?:Nombre\s+de\s+parts|Parts?\s+émises?)\s*:?\s*(\d[\d\s]*)', text, re.I)
    if m: d['L'] = m.group(1).strip().replace(' ', '')

    # M: shares awaiting withdrawal
    m = re.search(r'(\d[\d\s]*)\s*parts?\s*en\s*attente\s*de\s*retrait', text, re.I)
    if m: d['M'] = m.group(1).strip().replace(' ', '')

    # N: retained earnings (report à nouveau) — prefer "N jours de
    # distribution" wording, fall back to an amount in euros.
    m = re.search(r'Report\s+à\s+nouveau\s*:?\s*(\d+\s*jours?\s*de\s*distribution)', text, re.I)
    if m: d['N'] = m.group(1).strip()
    else:
        m = re.search(r'Report\s+à\s+nouveau\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
        if m: d['N'] = m.group(1).strip()

    # O: provision for major maintenance (PGE)
    m = re.search(r'(?:Provision|Réserve)\s+pour\s+gros\s+entretien\s*:?\s*([\d\s,.]+\s*€)', text, re.I)
    if m: d['O'] = m.group(1).strip()

    # Q: entitlement delay (délai de jouissance), in months
    m = re.search(r'Délai\s+de\s+jouissance\s*:?\s*(\d+\s*mois)', text, re.I)
    if m: d['Q'] = m.group(1).strip()

    # S: financial occupancy rate (TOF)
    m = re.search(r'(?:TOF|Taux\s+d\'occupation\s+financière)\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['S'] = m.group(1).replace(',', '.') + '%'

    # T: annual global performance (PGA), optionally followed by a year
    m = re.search(r'(?:Performance\s+globale\s+annuelle|PGA)\s*(?:\d{4})?\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['T'] = m.group(1).replace(',', '.') + '%'

    # U: 10-year internal rate of return (TRI)
    m = re.search(r'TRI\s+(?:10\s+ans|à\s+10\s+ans)\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['U'] = m.group(1).replace(',', '.') + '%'

    # V: subscription price
    m = re.search(r'(?:Valeur|Prix)\s+de\s+souscription\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['V'] = m.group(1).strip()

    # W: reconstitution value
    m = re.search(r'Valeur\s+de\s+reconstitution\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['W'] = m.group(1).strip()

    # X: subscription fee
    m = re.search(r'(?:Commission|Frais)\s+de\s+souscription\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['X'] = m.group(1).replace(',', '.') + '% TTC'

    # Y: management fee
    m = re.search(r'(?:Commission|Frais)\s+de\s+gestion\s*:?\s*(\d+[,.]?\d*)\s*%', text, re.I)
    if m: d['Y'] = m.group(1).replace(',', '.') + '% TTC'

    # G: ISR label (mere mention on the page counts as "yes")
    if re.search(r'Label\s+ISR', text, re.I):
        d['G'] = 'Oui'

    # D: capital type (fixed/variable)
    m = re.search(r'Capital\s+(fixe|variable)', text, re.I)
    if m: d['D'] = m.group(1).capitalize()

    # H: minimum subscription
    m = re.search(r'(?:Souscription|Investissement)\s+minimum\s*:?\s*(\d[\d\s,.]*\s*€)', text, re.I)
    if m: d['H'] = m.group(1).strip()

    # P: tax regime — German corporate tax mention wins over the generic
    # French regime keywords.
    if re.search(r'impôt\s+sur\s+les\s+sociétés\s+allemand|IS\s*15,825', text, re.I):
        d['P'] = 'Allemande (IS 15,825%)'
    elif re.search(r'revenus\s+fonciers|PFU|prélèvement\s+forfaitaire', text, re.I):
        d['P'] = 'Française (revenus fonciers / PFU 30%)'

    # E: geographic breakdown — capture a few "Place NN%" pairs after the
    # "Répartition géographique" heading.
    m = re.search(r'[Rr]épartition\s*géographique\s*:?\s*\n((?:[^\n]+\n){2,10})', text)
    if m:
        geo = m.group(1)
        places = re.findall(r'([A-ZÀÂÉÈÊËÎÏÔÛÜÇ][a-zàâéèêëîïôûüç\s]+?)\s+(\d+[,.]?\d*)\s*%', geo)
        if places:
            d['E'] = ', '.join(f'{p.strip()} {v}%' for p, v in places[:6])

    # F: sector breakdown, same approach.
    m = re.search(r'[Rr]épartition\s*(?:par\s+)?secteur\s*:?\s*\n((?:[^\n]+\n){2,10})', text)
    if m:
        sect = m.group(1)
        acts = re.findall(r'([\w\séèêâàîïôûüç/]+?)\s+(\d+[,.]?\d*)\s*%', sect)
        if acts:
            d['F'] = ', '.join(f'{a.strip()} {v}%' for a, v in acts[:6])

    # Fill only cells that were empty in the sheet (never overwrite data).
    updates = [(col, d[col]) for col in empty if col in d]

    if not updates:
        continue

    print(f"Row {r:3d} ({scpi:35s}): ", end="", flush=True)
    row_cells = 0
    for col, value in updates:
        if gws_update(f"{col}{r}", value):
            print(f"{col}✅ ", end="", flush=True)
            row_cells += 1
            time.sleep(0.15)  # stay under the Sheets API write-rate limit
        else:
            print(f"{col}❌ ", end="", flush=True)

    print(f"({row_cells})")
    total_rows += 1
    total_cells += row_cells
    time.sleep(0.3)  # polite pause between rows

print(f"\n\n🏁 TOTAL: {total_rows} rows, {total_cells} cells filled via ideal-investisseur")
