import re

# Heading that opens the "critical risk chains" section.
_RE_START_SECTION = re.compile(r"^##\s*Chaînes\s+avec\s+risque\s+critique", re.IGNORECASE)
# Any other level-2 heading; it terminates the chains section.
_RE_OTHER_H2 = re.compile(r"^##\s+(?!(Chaînes\s+avec\s+risque\s+critique))")
# "### <product> → <component> → <mineral>"
_RE_CHAIN_HEADING = re.compile(r"^###\s*(.+)\s*→\s*(.+)\s*→\s*(.+)$")
# Bullet that switches the current phase inside a chain.
_RE_PHASE = re.compile(
    r"^\*\s*(Assemblage|Fabrication|Minerai|Extraction|Traitement)", re.IGNORECASE
)
_RE_IHH = re.compile(r"IHH\s*[:]\s*([0-9]+(?:\.[0-9]+)?)")
_RE_ISG = re.compile(
    r"ISG\s*combiné\s*[:]\s*([0-9]+(?:\.[0-9]+)?)|ISG\s*[:]\s*([0-9]+(?:\.[0-9]+)?)",
    re.IGNORECASE,
)
_RE_ICS = re.compile(r"ICS\s*moyen\s*[:]\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)
_RE_IVC = re.compile(r"IVC\s*[:]\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)

# phase -> (key of the chain member that owns the metrics,
#           ordered (pattern, field) pairs; the first pattern found on a line wins)
_PHASE_METRICS = {
    "Assemblage": ("prod", ((_RE_IHH, "IHH_Assemblage"), (_RE_ISG, "ISG_Assemblage"))),
    "Fabrication": ("comp", ((_RE_IHH, "IHH_Fabrication"), (_RE_ISG, "ISG_Fabrication"))),
    "Minerai": ("miner", ((_RE_ICS, "ICS"), (_RE_IVC, "IVC"))),
    "Extraction": ("miner", ((_RE_IHH, "IHH_Extraction"), (_RE_ISG, "ISG_Extraction"))),
    "Traitement": ("miner", ((_RE_IHH, "IHH_Traitement"), (_RE_ISG, "ISG_Traitement"))),
}


def _save_section(details: dict, key, content: list) -> None:
    """Store ``content`` joined by newlines under ``key``; skip unnamed or empty sections."""
    if key and content:
        details[key] = "\n".join(content)


def _parse_chain_section(lines: list) -> tuple[dict, dict, dict, list, dict]:
    """Parse the chains section: per-entity metrics, chain list and chain descriptions."""
    produits, composants, mineraux, chains = {}, {}, {}, []
    tables = {"prod": produits, "comp": composants, "miner": mineraux}
    description_parts = {}
    current_chain = None
    current_phase = None
    current_section = None
    in_section = False
    last_index = len(lines) - 1

    for index, text in enumerate(lines):
        # Rebuild the line as it appeared in the file (newline included) so that
        # descriptions keep their original line breaks.
        raw_line = text if index == last_index else text + "\n"
        line = text.strip()

        if not in_section:
            in_section = bool(_RE_START_SECTION.match(line))
            continue
        if _RE_OTHER_H2.match(line):
            break

        if m_chain := _RE_CHAIN_HEADING.match(line):
            prod, comp, miner = (part.strip() for part in m_chain.groups())
            produits.setdefault(prod, {"IHH_Assemblage": None, "ISG_Assemblage": None})
            composants.setdefault(comp, {"IHH_Fabrication": None, "ISG_Fabrication": None})
            mineraux.setdefault(miner, {
                "ICS": None,
                "IVC": None,
                "IHH_Extraction": None,
                "ISG_Extraction": None,
                "IHH_Traitement": None,
                "ISG_Traitement": None,
            })
            chains.append({"produit": prod, "composant": comp, "minerai": miner})
            current_chain = {"prod": prod, "comp": comp, "miner": miner}
            current_phase = None
            current_section = f"{prod} → {comp} → {miner}"
            # A repeated chain heading restarts its description.
            description_parts[current_section] = []
            continue

        if current_chain is None:
            continue

        if m_phase := _RE_PHASE.match(line):
            current_phase = m_phase.group(1).capitalize()
            continue

        if not current_phase:
            # Text between the chain heading and its first phase bullet.
            description_parts[current_section].append(raw_line)
            continue

        phase_spec = _PHASE_METRICS.get(current_phase)
        if phase_spec is None:
            continue
        owner_key, metrics = phase_spec
        record = tables[owner_key][current_chain[owner_key]]
        for pattern, field in metrics:
            if m := pattern.search(line):
                # _RE_ISG has two alternative groups; take whichever one matched.
                record[field] = float(next(g for g in m.groups() if g is not None))
                break

    descriptions = {name: "".join(parts) for name, parts in description_parts.items()}
    return produits, composants, mineraux, chains, descriptions


def _parse_operation_details(section_lines: list) -> dict:
    """Collect "### <name> et Assemblage|Fabrication" sub-sections of the operations section."""
    details = {}
    section_name = None
    content = []
    for line in section_lines:
        if line.startswith("### ") and " et " in line:
            _save_section(details, section_name, content)
            title = line.removeprefix("### ").strip()
            if " et Assemblage" in title:
                section_name = f"{title.replace(' et Assemblage', '').strip()}_assemblage"
            elif " et Fabrication" in title:
                section_name = f"{title.replace(' et Fabrication', '').strip()}_fabrication"
            else:
                # Unrecognised heading: drop its content instead of letting it
                # overwrite the section that was just saved.
                section_name = None
            content = []
        elif section_name:
            content.append(line)
    _save_section(details, section_name, content)
    return details


def _parse_mineral_details(section_lines: list) -> dict:
    """Collect per-mineral general / extraction / traitement texts of the minerals section."""
    details = {}
    minerai = None
    section_type = "general"
    content = []

    for line in section_lines:
        key = f"{minerai}_{section_type}" if minerai else None
        if line.startswith("### ") and "→" not in line and " et " not in line:
            _save_section(details, key, content)
            minerai = line.removeprefix("### ").strip()
            section_type = "general"
            content = []
        elif line.startswith("#### Extraction"):
            _save_section(details, key, content)
            section_type = "extraction"
            content = []
        elif line.startswith("#### Traitement"):
            _save_section(details, key, content)
            section_type = "traitement"
            content = []
        elif line.startswith("## ") and minerai:
            # Next level-2 heading ends the minerals section.
            break
        elif minerai:
            content.append(line)

    _save_section(details, f"{minerai}_{section_type}" if minerai else None, content)
    return details


def parse_chains_md(filepath: str) -> tuple[dict, dict, dict, list, dict, dict]:
    """Parse a chains markdown report into metric tables and detail texts.

    The file is read once (UTF-8) and scanned for three level-2 sections:

    * the section whose heading matches "## Chaînes avec risque critique":
      each "### produit → composant → minerai" heading declares a chain,
      "* <phase>" bullets select the phase, and IHH / ISG / ICS / IVC values
      found on the following lines are stored as floats;
    * "## Détails des opérations": "### <name> et Assemblage" and
      "### <name> et Fabrication" sub-sections;
    * "## Détails des minerais": "### <minerai>" sub-sections, optionally split
      by "#### Extraction" / "#### Traitement".

    Args:
        filepath: Path of the markdown file to parse.

    Returns:
        A 6-tuple ``(produits, composants, mineraux, chains, descriptions,
        details_sections)``:

        * ``produits``: product name -> ``{"IHH_Assemblage", "ISG_Assemblage"}``
        * ``composants``: component name -> ``{"IHH_Fabrication", "ISG_Fabrication"}``
        * ``mineraux``: mineral name -> ICS, IVC and IHH/ISG for Extraction and
          Traitement
        * ``chains``: list of ``{"produit", "composant", "minerai"}`` dicts
        * ``descriptions``: "prod → comp → miner" -> text found between the
          chain heading and its first phase bullet
        * ``details_sections``: "<name>_assemblage", "<name>_fabrication",
          "<minerai>_general|extraction|traitement" -> section text

        Metrics that are not found stay ``None``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filepath, encoding="utf-8") as f:
        lines = f.read().split("\n")

    produits, composants, mineraux, chains, descriptions = _parse_chain_section(lines)

    # Locate the two detail sections; when a heading is repeated the last one wins.
    operations_start = None
    minerais_start = None
    for index, text in enumerate(lines):
        heading = text.strip()
        if heading == "## Détails des opérations":
            operations_start = index
        elif heading == "## Détails des minerais":
            minerais_start = index

    details_sections = {}

    if operations_start is not None:
        # The minerals heading bounds the operations section only when it comes
        # after it; otherwise the operations section runs to the end of the file.
        if minerais_start is not None and minerais_start > operations_start:
            operations_end = minerais_start
        else:
            operations_end = len(lines)
        details_sections.update(
            _parse_operation_details(lines[operations_start:operations_end])
        )

    if minerais_start is not None:
        details_sections.update(_parse_mineral_details(lines[minerais_start:]))

    return produits, composants, mineraux, chains, descriptions, details_sections