Feat: convert stagging2gold
parent 9e5541a770
commit 25ede1789a

109 scripts/flux.py Normal file
@@ -0,0 +1,109 @@
import logging
from abc import abstractmethod
from collections.abc import Callable
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class Source(BaseModel):
    """A single input file feeding a flux."""

    filename: str

    @abstractmethod
    def get_df(self, base_path: Path) -> pd.DataFrame:
        raise NotImplementedError


class ExcelSource(Source):
    sheet_name: str

    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
        logger.debug(f"Get content of {filepath}")
        return pd.read_excel(filepath, sheet_name=self.sheet_name)


class CSVSource(Source):
    options: dict

    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
        logger.debug(f"Get content of {filepath}")
        return pd.read_csv(filepath, **self.options)


class Flux(BaseModel):
    """A set of sources, the transformation applied to them, and its keyword arguments."""

    sources: list[Source]
    transformation: Callable
    extra_kwrds: dict = {}


def to_csv(df, dest_basename):
    # Append to the destination CSV if it already exists, create it otherwise.
    dest = dest_basename.parent / (dest_basename.name + ".csv")
    if dest.exists():
        df.to_csv(dest, mode="a", header=False, index=False)
    else:
        df.to_csv(dest, index=False)
    return dest


def write_split_by(
    df: pd.DataFrame, column: str, dest_path: Path, writing_func
) -> list[Path]:
    """Write one file per distinct value of `column` and return the written paths."""
    wrote_files = []

    for col_value in df[column].unique():
        filtered_df = df[df[column] == col_value]

        dest_basename = dest_path / f"{col_value}"
        dest = writing_func(filtered_df, dest_basename)
        wrote_files.append(dest)

    return wrote_files


def extract_sources(sources: list[Source], base_path: Path = Path()):
    for src in sources:
        if "*" in src.filename:
            # Expand glob patterns relative to base_path, then recurse on the result.
            expanded_src = [
                src.model_copy(update={"filename": str(p.relative_to(base_path))})
                for p in base_path.glob(src.filename)
            ]
            yield from extract_sources(expanded_src, base_path)
        else:
            filepath = base_path / src.filename
            assert filepath.exists()
            yield src.filename, src.get_df(base_path)


def split_duplicates(
    df, origin: str, duplicated: dict[str, pd.DataFrame]
) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]]:
    # Keep the first occurrence of each row; stash the duplicates under their origin.
    duplicates = df.duplicated()
    no_duplicates = df[~duplicates]
    duplicated[origin] = df[duplicates]
    return no_duplicates, duplicated


def consume_fluxes(
    fluxes: dict[str, Flux], origin_path: Path, dest_path: Path, writing_func=to_csv
):
    duplicated = {}
    wrote_files = []

    for name, flux in fluxes.items():
        logger.info(f"Processing flux {name}")
        src_df = []
        for filename, df in extract_sources(flux.sources, origin_path):
            df, duplicated = split_duplicates(df, str(filename), duplicated)
            src_df.append(df)

        df = flux.transformation(src_df, **flux.extra_kwrds)

        files = write_split_by(df, "Année", dest_path, writing_func)
        wrote_files += files
    return wrote_files
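
A minimal sketch of how the new flux API is meant to be driven. The paths, workbook name, and the passthrough transformation below are hypothetical; note that the transformed frame must carry an "Année" column, since consume_fluxes splits output files on it:

from pathlib import Path

import pandas as pd

from scripts.flux import ExcelSource, Flux, consume_fluxes


def passthrough(dfs: list[pd.DataFrame], **kwrds) -> pd.DataFrame:
    # No-op transformation: just concatenate the extracted sources.
    return pd.concat(dfs)


fluxes = {
    "demo": Flux(
        sources=[ExcelSource(filename="demo.xlsx", sheet_name="Sheet1")],
        transformation=passthrough,
    )
}

# Reads datas/history/demo.xlsx, drops duplicated rows, applies the
# transformation, then writes one CSV per "Année" value under datas/staging/.
written = consume_fluxes(fluxes, Path("datas/history"), Path("datas/staging"))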
@@ -1,84 +1,17 @@
 import logging
-from collections.abc import Callable
 from pathlib import Path
 
 import pandas as pd
-from pydantic import BaseModel
+
+from scripts.flux import consume_fluxes
+
+from .flux import ExcelSource, Flux
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
 
-class Source(BaseModel):
-    filename: str
-    sheet_name: str = ""
-
-
-class Flux(BaseModel):
-    sources: list[Source]
-    transformation: Callable
-    extra_kwrds: dict = {}
-
-
-def to_csv(df, dest_basename):
-    dest = dest_basename.parent / (dest_basename.name + ".csv")
-    if dest.exists():
-        df.to_csv(dest, mode="a", header=False, index=False)
-    else:
-        df.to_csv(dest, index=False)
-    return dest
-
-
-def write_split_by(
-    df: pd.DataFrame, column: str, dest_path: Path, writing_func
-) -> list[Path]:
-    wrote_files = []
-
-    for col_value in df[column].unique():
-        filtered_df = df[df[column] == col_value]
-
-        dest_basename = dest_path / f"{col_value}"
-        dest = writing_func(filtered_df, dest_basename)
-        wrote_files.append(dest)
-
-    return wrote_files
-
-
-def extract_sources(sources: list[Source], base_path: Path = Path()):
-    for src in sources:
-        filepath = base_path / src.filename
-        assert filepath.exists
-        yield src.filename, pd.read_excel(filepath, sheet_name=src.sheet_name)
-
-
-def split_duplicates(
-    df, origin: str, duplicated: dict[str, pd.DataFrame]
-) -> [pd.DataFrame, dict[str, pd.DataFrame]]:
-    duplicates = df.duplicated()
-    no_duplicates = df[~duplicates]
-    duplicated[origin] = df[duplicates]
-    return no_duplicates, duplicated
-
-
-def crg(history_path: Path, staging_path: Path, metadatas: dict, writing_func=to_csv):
-    duplicated = {}
-    wrote_files = []
-
-    for name, metadata in metadatas.items():
-        logger.debug(f"Processing {name}")
-        src_df = []
-        for filename, df in extract_sources(metadata.sources, history_path):
-            df, duplicated = split_duplicates(df, str(filename), duplicated)
-            src_df.append(df)
-
-        df = metadata.transformation(src_df, **metadata.extra_kwrds)
-
-        files = write_split_by(df, "Année", staging_path, writing_func)
-        wrote_files += files
-    return wrote_files
-
-
-def extract_cat(cat):
+def extract_cat(cat: pd.DataFrame):
     cat_drop = list(cat[cat["Nouvelles"] == "NE PAS IMPORTER"]["Anciennes"])
     cat_trans = cat[cat["Nouvelles"] != "NE PAS IMPORTER"]
 
@@ -89,7 +22,9 @@ def extract_cat(cat):
     return trans, cat_drop
 
 
-def trans_2017_2021(dfs, **kwrds):
+def trans_2017_2021(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df, cat = dfs
     cat_trans, cat_drop = extract_cat(cat)
 
@@ -107,10 +42,12 @@ def trans_2017_2021(dfs, **kwrds):
         Fournisseur="",
     )
 
-    return df
+    return df[stagging_columns]
 
 
-def trans_2022_charge(dfs, **kwrds):
+def trans_2022_charge(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Immeuble=df["immeuble"],
@@ -126,10 +63,12 @@ def trans_2022_charge(dfs, **kwrds):
         Régie="Oralia - Gelas",
         Libellé="",
     )
-    return df
+    return df[stagging_columns]
 
 
-def trans_2022_loc(dfs, **kwrds):
+def trans_2022_loc(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Immeuble=df["immeuble"],
@@ -144,10 +83,12 @@ def trans_2022_loc(dfs, **kwrds):
         Régie="Oralia - Gelas",
         Libellé="",
     )
-    return df
+    return df[stagging_columns]
 
 
-def trans_2023(dfs, year, **kwrds):
+def trans_2023(
+    dfs: list[pd.DataFrame], year: str, stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Débit=df["Débit"].fillna(0),
@@ -155,43 +96,60 @@ def trans_2023(dfs, year, **kwrds):
         Lot=df["Immeuble"].astype(str) + df["Porte"].astype("str").str.zfill(2),
         Année=year,
     )
-    return df
+    return df[stagging_columns]
 
 
-METADATAS = {
+STAGGING_COLUMNS = [
+    "Régie",
+    "Immeuble",
+    "Porte",
+    "Lot",
+    "Année",
+    "Mois",
+    "Catégorie",
+    "Fournisseur",
+    "Libellé",
+    "Débit",
+    "Crédit",
+]
+
+FLUXES = {
     "2017 2021 - charge et locataire.xlsx": Flux(
         sources=[
-            Source(
+            ExcelSource(
                 filename="2017 2021 - charge et locataire.xlsx", sheet_name="DB CRG"
             ),
-            Source(
+            ExcelSource(
                 filename="2017 2021 - charge et locataire.xlsx",
                 sheet_name="Catégories",
             ),
         ],
         transformation=trans_2017_2021,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2022 - charge.xlsx": Flux(
         sources=[
-            Source(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
        ],
         transformation=trans_2022_charge,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2022 - locataire.xlsx": Flux(
         sources=[
-            Source(filename="2022 - locataire.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - locataire.xlsx", sheet_name="Sheet1"),
         ],
         transformation=trans_2022_loc,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2023 - charge et locataire.xlsx": Flux(
         sources=[
-            Source(
+            ExcelSource(
                 filename="2023 - charge et locataire.xlsx",
                 sheet_name="DB CRG 2023 ...",
             ),
         ],
         transformation=trans_2023,
-        extra_kwrds={"year": 2023},
+        extra_kwrds={"year": 2023, "stagging_columns": STAGGING_COLUMNS},
     ),
 }
 
@@ -208,5 +166,5 @@ if __name__ == "__main__":
     staging_crg_path = staging_path / "CRG"
     assert staging_crg_path.exists()
 
-    crg_files = crg(history_crg_path, staging_crg_path, METADATAS)
+    crg_files = consume_fluxes(FLUXES, history_crg_path, staging_crg_path)
     print(crg_files)
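
Worth noting for review: consume_fluxes forwards extra_kwrds as keyword arguments, i.e. flux.transformation(src_df, **flux.extra_kwrds), which is how {"year": 2023, "stagging_columns": STAGGING_COLUMNS} reaches the named parameters of trans_2023. A tiny self-contained sketch of that call shape (trans_demo and its columns are made up):

import pandas as pd


def trans_demo(
    dfs: list[pd.DataFrame], year: str, stagging_columns: list[str], **kwrds
) -> pd.DataFrame:
    # Same shape as the trans_* functions: enrich, then project onto the
    # staging column order.
    return dfs[0].assign(Année=year)[stagging_columns]


extra_kwrds = {"year": "2023", "stagging_columns": ["Régie", "Année"]}
df = trans_demo([pd.DataFrame({"Régie": ["Oralia - Gelas"]})], **extra_kwrds)
print(df)  # one row with columns Régie and Année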

69 scripts/stagging_gold.py Normal file
@@ -0,0 +1,69 @@
import logging
from collections.abc import Callable
from pathlib import Path

import pandas as pd

from scripts.flux import CSVSource, Flux, consume_fluxes

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def feature_crg(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    # Gold-level feature: signed impact of each line on the balance.
    df = dfs[0]
    df = df.assign(
        Impact=df["Crédit"] - df["Débit"],
    )
    return df


GOLD_COLUMNS = [
    "Régie",
    "Immeuble",
    "Porte",
    "Lot",
    "Année",
    "Mois",
    "Catégorie",
    "Fournisseur",
    "Libellé",
    "Débit",
    "Crédit",
    "Impact",
]


def build_crg_fluxes(
    crg_path: Path, pattern: str, transformation: Callable, csv_options: dict = {}
) -> dict[str, Flux]:
    # Build one Flux per staging CSV matching the pattern.
    fluxes = {}
    for crg in crg_path.glob(pattern):
        fluxes[f"CRG - {crg}"] = Flux(
            sources=[CSVSource(filename=crg.name, options=csv_options)],
            transformation=transformation,
        )
    return fluxes


if __name__ == "__main__":
    data_path = Path("datas/")
    assert data_path.exists()

    staging_path = data_path / "staging"
    assert staging_path.exists()
    staging_crg_path = staging_path / "CRG"
    assert staging_crg_path.exists()

    gold_path = data_path / "gold"
    assert gold_path.exists()
    gold_crg_path = gold_path / "CRG"
    assert gold_crg_path.exists()

    fluxes = build_crg_fluxes(
        crg_path=staging_crg_path, pattern="*.csv", transformation=feature_crg
    )
    crg_files = consume_fluxes(
        fluxes=fluxes, origin_path=staging_crg_path, dest_path=gold_crg_path
    )
    print(crg_files)
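
A quick self-contained check of the gold feature, with toy values (assumes the module is importable as scripts.stagging_gold):

import pandas as pd

from scripts.stagging_gold import feature_crg

df = pd.DataFrame({"Débit": [100.0, 0.0], "Crédit": [0.0, 250.0]})
# Impact = Crédit - Débit for each row.
print(feature_crg([df])["Impact"].tolist())  # [-100.0, 250.0]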