Code/app/plan_d_action/utils/data/data_processing.py
2025-06-03 14:53:41 +02:00

193 lines
8.5 KiB
Python

import re
def parse_chains_md(filepath: str) -> tuple[dict, dict, dict, list, dict, dict]:
    """Parse a Markdown report of critical supply chains.

    The file is read once and scanned for three parts:

    * ``## Chaînes avec risque critique``: ``### produit → composant → minerai``
      headings, each followed by free text and ``* <phase>`` bullets carrying
      IHH / ISG / ICS / IVC values.
    * ``## Détails des opérations``: ``### <name> et Assemblage`` and
      ``### <name> et Fabrication`` subsections.
    * ``## Détails des minerais``: ``### <mineral>`` subsections, optionally
      split by ``#### Extraction`` and ``#### Traitement``.

    Args:
        filepath: Path of the UTF-8 encoded Markdown file.

    Returns:
        A 6-tuple ``(produits, composants, mineraux, chains, descriptions,
        details_sections)``:

        * ``produits``: product name -> ``{"IHH_Assemblage", "ISG_Assemblage"}``.
        * ``composants``: component name ->
          ``{"IHH_Fabrication", "ISG_Fabrication"}``.
        * ``mineraux``: mineral name -> ``{"ICS", "IVC", "IHH_Extraction",
          "ISG_Extraction", "IHH_Traitement", "ISG_Traitement"}``.
        * ``chains``: list of ``{"produit", "composant", "minerai"}`` dicts in
          file order.
        * ``descriptions``: chain key -> raw text found between the chain
          heading and its first phase bullet.
        * ``details_sections``: ``"<name>_assemblage"``,
          ``"<name>_fabrication"``, ``"<mineral>_general"``,
          ``"<mineral>_extraction"``, ``"<mineral>_traitement"`` -> text.

        Index values are floats, or ``None`` when not found in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, encoding="utf-8") as f:
        raw_lines = f.readlines()

    produits, composants, mineraux, chains, descriptions = _parse_critical_chains(raw_lines)

    # Splitting the re-joined text on "\n" keeps the trailing empty element a
    # final newline produces, which ends up in the stored section text.
    lines = "".join(raw_lines).split("\n")

    operations_start = None
    minerais_start = None
    for i, line in enumerate(lines):
        heading = line.strip()
        if heading == "## Détails des opérations":
            operations_start = i
        elif heading == "## Détails des minerais":
            minerais_start = i

    details_sections = {}
    if operations_start is not None:
        # The minerals heading only bounds the operations section when it
        # comes after it; otherwise the section runs to the end of the file.
        if minerais_start is not None and minerais_start > operations_start:
            operations_end = minerais_start
        else:
            operations_end = len(lines)
        details_sections.update(
            _parse_operations_details(lines[operations_start:operations_end])
        )
    if minerais_start is not None:
        details_sections.update(_parse_minerais_details(lines[minerais_start:]))

    return produits, composants, mineraux, chains, descriptions, details_sections


def _parse_critical_chains(raw_lines: list) -> tuple[dict, dict, dict, list, dict]:
    """Extract chains, index values and descriptions from the critical-risk section."""
    re_start_section = re.compile(r"^##\s*Chaînes\s+avec\s+risque\s+critique", re.IGNORECASE)
    re_other_h2 = re.compile(r"^##\s+(?!(Chaînes\s+avec\s+risque\s+critique))")
    re_chain_heading = re.compile(r"^###\s*(.+)\s*→\s*(.+)\s*→\s*(.+)$")
    re_phase = re.compile(r"^\*\s*(Assemblage|Fabrication|Minerai|Extraction|Traitement)", re.IGNORECASE)
    re_IHH = re.compile(r"IHH\s*[:]\s*([0-9]+(?:\.[0-9]+)?)")
    re_ISG = re.compile(r"ISG\s*combiné\s*[:]\s*([0-9]+(?:\.[0-9]+)?)|ISG\s*[:]\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)
    re_ICS = re.compile(r"ICS\s*moyen\s*[:]\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)
    re_IVC = re.compile(r"IVC\s*[:]\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)

    produits, composants, mineraux, chains = {}, {}, {}, []
    description_parts = {}

    # phase -> (member of the current chain that owns the values, dict that
    # stores them, (pattern, field) pairs tried in order; first match wins).
    phase_rules = {
        "Assemblage": ("prod", produits, ((re_IHH, "IHH_Assemblage"), (re_ISG, "ISG_Assemblage"))),
        "Fabrication": ("comp", composants, ((re_IHH, "IHH_Fabrication"), (re_ISG, "ISG_Fabrication"))),
        "Minerai": ("miner", mineraux, ((re_ICS, "ICS"), (re_IVC, "IVC"))),
        "Extraction": ("miner", mineraux, ((re_IHH, "IHH_Extraction"), (re_ISG, "ISG_Extraction"))),
        "Traitement": ("miner", mineraux, ((re_IHH, "IHH_Traitement"), (re_ISG, "ISG_Traitement"))),
    }

    current_chain = None
    current_phase = None
    current_section = None
    in_section = False

    for raw_line in raw_lines:
        line = raw_line.strip()

        if not in_section:
            if re_start_section.match(line):
                in_section = True
            continue

        # Any other level-2 heading closes the critical-chains section.
        if re_other_h2.match(line):
            break

        m_chain = re_chain_heading.match(line)
        if m_chain:
            prod, comp, miner = map(str.strip, m_chain.groups())
            produits.setdefault(prod, {"IHH_Assemblage": None, "ISG_Assemblage": None})
            composants.setdefault(comp, {"IHH_Fabrication": None, "ISG_Fabrication": None})
            mineraux.setdefault(miner, {
                "ICS": None, "IVC": None,
                "IHH_Extraction": None, "ISG_Extraction": None,
                "IHH_Traitement": None, "ISG_Traitement": None,
            })
            chains.append({"produit": prod, "composant": comp, "minerai": miner})
            current_chain = {"prod": prod, "comp": comp, "miner": miner}
            current_phase = None
            # NOTE(review): the key has no separator, so distinct chains can
            # collide ("AB"+"C"+"D" vs "A"+"BC"+"D"). Left unchanged because
            # callers may look descriptions up by this exact key - confirm.
            current_section = f"{prod}{comp}{miner}"
            description_parts[current_section] = []
            continue

        if current_chain is None:
            continue

        m_phase = re_phase.match(line)
        if m_phase:
            current_phase = m_phase.group(1).capitalize()
            continue

        if not current_phase:
            # Text between the chain heading and its first phase bullet.
            if current_section:
                description_parts[current_section].append(raw_line)
            continue

        owner, store, rules = phase_rules[current_phase]
        for pattern, field in rules:
            m = pattern.search(line)
            if m:
                # re_ISG has two alternative groups; exactly one is set.
                value = next(g for g in m.groups() if g is not None)
                store[current_chain[owner]][field] = float(value)
                break

    descriptions = {key: "".join(parts) for key, parts in description_parts.items()}
    return produits, composants, mineraux, chains, descriptions


def _parse_operations_details(section_lines: list) -> dict:
    """Map ``<name>_assemblage`` / ``<name>_fabrication`` to their subsection text."""
    sections = {}
    name = None
    content = []

    for line in section_lines:
        if line.startswith("### ") and " et " in line:
            if name and content:
                sections[name] = "\n".join(content)
            title = line.replace("### ", "").strip()
            if " et Assemblage" in title:
                product_name = title.replace(" et Assemblage", "").strip()
                name = f"{product_name}_assemblage"
            elif " et Fabrication" in title:
                component_name = title.replace(" et Fabrication", "").strip()
                name = f"{component_name}_fabrication"
            else:
                # Unrecognised "X et Y" heading: drop its text rather than
                # filing it under (and overwriting) the previous section.
                name = None
            content = []
        elif name:
            content.append(line)

    if name and content:
        sections[name] = "\n".join(content)
    return sections


def _parse_minerais_details(section_lines: list) -> dict:
    """Map ``<mineral>_general`` / ``_extraction`` / ``_traitement`` to their text."""
    sections = {}
    minerai = None
    section_type = "general"
    content = []

    def save_current() -> None:
        # Sections with no collected line are never stored.
        if minerai and content:
            sections[f"{minerai}_{section_type}"] = "\n".join(content)

    for line in section_lines:
        # A mineral heading is a level-3 heading that is neither a chain
        # heading (contains "→") nor an operations heading (contains " et ").
        if line.startswith("### ") and "→" not in line and " et " not in line:
            save_current()
            minerai = line.replace("### ", "").strip()
            section_type = "general"
            content = []
        elif line.startswith("#### Extraction"):
            save_current()
            section_type = "extraction"
            content = []
        elif line.startswith("#### Traitement"):
            save_current()
            section_type = "traitement"
            content = []
        elif line.startswith("## ") and minerai:
            # The next level-2 heading ends the minerals section.
            break
        elif minerai:
            content.append(line)

    save_current()
    return sections