Feat: write cli feature, datamart

Feat: start CLI
Feat: move assign Lot to featured
2024-03-05 19:20:33 +01:00 · 2024-03-05 19:00:45 +01:00 · 2024-03-05 19:00:17 +01:00 · 2024-03-05 18:59:55 +01:00 · 2024-03-05 18:59:01 +01:00 · 2024-03-04 20:09:20 +01:00
6 changed files with 134 additions and 29 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,4 @@ jupyter==1.0.0
 pandas==1.5.0
 pdf-oralia==0.3.11
 pydantic==2.6.1
 click==8.1.7
--- a/scripts/main.py
+++ b/scripts/main.py
@ -0,0 +1,86 @@
 import logging
 from logging.config import dictConfig
 from pathlib import Path
 import click
 from .flux import consume_fluxes
 # Data directories used by the CLI, all resolved relative to the current
 # working directory.
 DATA_PATH = Path("datas/")
 # Origin of the `ingest` command (its "CRG" sub-directory is read).
 HISTORY_PATH = DATA_PATH / "Histoire"
 # Destination of `ingest`, origin of `feature`.
 STAGING_PATH = DATA_PATH / "staging"
 # Destination of `feature`, origin of `datamart`.
 GOLD_PATH = DATA_PATH / "gold"
 # Destination of `datamart`.
 MART_PATH = DATA_PATH / "datamart"
 # Fail fast at import time when the expected layout is missing. An explicit
 # raise is used instead of `assert` so the check is not stripped when Python
 # runs with -O, and so the error names the missing directory.
 for _required_dir in (DATA_PATH, HISTORY_PATH, STAGING_PATH, GOLD_PATH, MART_PATH):
    if not _required_dir.exists():
        raise FileNotFoundError(
            f"Required data directory is missing: {_required_dir}"
        )
 del _required_dir
@click.group()
@click.option("--debug/--no-debug", default=False)
def main(debug):
    # Root command group: configures logging before a sub-command runs.
    # Documented with comments rather than a docstring because click uses the
    # docstring as the command's help text.
    #
    # --debug switches both the stream handler and the root logger to DEBUG;
    # without it they are set to INFO.
    level = logging.DEBUG if debug else logging.INFO
    dictConfig(
        {
            "version": 1,
            "formatters": {
                "f": {"format": "%(levelname)-8s %(name)-12s %(message)s"},
            },
            "handlers": {
                "h": {
                    "class": "logging.StreamHandler",
                    "formatter": "f",
                    "level": level,
                },
            },
            "root": {
                "handlers": ["h"],
                "level": level,
            },
        }
    )
@main.command()
def ingest():
    # Consume the FLUXES_CRG fluxes of history_stagging, reading from
    # HISTORY_PATH/"CRG" and writing to STAGING_PATH/"CRG".
    # (Comments, not a docstring: click would expose a docstring as help text.)
    #
    # Raises FileNotFoundError when either directory is missing. An explicit
    # raise replaces the former `assert` so the check survives `python -O`.
    from .history_stagging import FLUXES_CRG

    history_crg_path = HISTORY_PATH / "CRG"
    staging_crg_path = STAGING_PATH / "CRG"
    for required_dir in (history_crg_path, staging_crg_path):
        if not required_dir.exists():
            raise FileNotFoundError(
                f"Required CRG directory is missing: {required_dir}"
            )

    consume_fluxes(
        fluxes=FLUXES_CRG,
        origin_path=history_crg_path,
        dest_path=staging_crg_path,
    )
@main.command()
def feature():
    # Consume the fluxes built by stagging_gold.FLUXES_CRG, reading from
    # STAGING_PATH/"CRG" and writing to GOLD_PATH/"CRG".
    # (Comments, not a docstring: click would expose a docstring as help text.)
    #
    # Raises FileNotFoundError when either directory is missing. An explicit
    # raise replaces the former `assert` so the check survives `python -O`.
    from .stagging_gold import FLUXES_CRG

    staging_crg_path = STAGING_PATH / "CRG"
    gold_crg_path = GOLD_PATH / "CRG"
    for required_dir in (staging_crg_path, gold_crg_path):
        if not required_dir.exists():
            raise FileNotFoundError(
                f"Required CRG directory is missing: {required_dir}"
            )

    consume_fluxes(
        # Here FLUXES_CRG is a function: the fluxes are built from the
        # staging directory at call time.
        fluxes=FLUXES_CRG(staging_crg_path),
        origin_path=staging_crg_path,
        dest_path=gold_crg_path,
    )
@main.command()
def datamart():
    # Consume the FLUXES_LOT fluxes of gold_mart, reading from GOLD_PATH and
    # writing to MART_PATH.
    # (Comments, not a docstring: click would expose a docstring as help text.)
    from .gold_mart import FLUXES_LOT

    consume_fluxes(
        fluxes=FLUXES_LOT,
        origin_path=GOLD_PATH,
        dest_path=MART_PATH,
    )
 # Run the click command group when this file is executed as the main module.
 # NOTE(review): the module uses package-relative imports (`from .flux ...`),
 # so it presumably has to be started as a module (e.g. `python -m ...`) for
 # this guard to be reached - confirm the intended invocation.
 if __name__ == "__main__":
    main()
--- a/scripts/flux.py
+++ b/scripts/flux.py
@ -6,9 +6,6 @@ from pathlib import Path
 import pandas as pd
 from pydantic import BaseModel
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 class Source(BaseModel):
    filename: str
@ -23,7 +20,7 @@ class ExcelSource(Source):
    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
-        logger.debug(f"Get content of {filepath}")
+        logging.debug(f"Get content of {filepath}")
        return pd.read_excel(filepath, sheet_name=self.sheet_name)
@ -32,7 +29,7 @@ class CSVSource(Source):
    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
-        logger.debug(f"Get content of {filepath}")
+        logging.debug(f"Get content of {filepath}")
        return pd.read_csv(filepath, **self.options)
@ -132,15 +129,16 @@ def consume_fluxes(
    wrote_files = []
    for name, flux in fluxes.items():
-        print(name)
+        logging.info(f"Consume {name}")
        logger.info(f"Processing flux {name}")
        src_df = []
        for filename, df in extract_sources(flux.sources, origin_path):
            logging.info(f"Extracting {filename}")
            df, duplicated = split_duplicates(df, str(filename), duplicated)
            src_df.append(df)
        logging.info(f"Execute {flux.transformation.function.__name__}")
        df = flux.transformation.function(src_df, **flux.transformation.extra_kwrds)
        files = flux.destination.write(df, dest_path, writing_func)
        logging.info(f"{files} written")
        wrote_files += files
    return wrote_files
--- a/scripts/gold_mart.py
+++ b/scripts/gold_mart.py
@ -18,13 +18,22 @@ logger.setLevel(logging.DEBUG)
 def build_lots(dfs: list[pd.DataFrame]) -> pd.DataFrame:
-    df = dfs[0]
+    df = pd.concat(dfs)
    df = df.assign(
        Impact=df["Crédit"] - df["Débit"],
    )
    return df
 # Datamart fluxes, keyed by flux name. The single "Lots" flux reads the
 # "CRG/crg-*.csv" sources, transforms them with build_lots and hands the
 # result to a SplitDestination configured with split_column="Lot".
 FLUXES_LOT = dict(
    Lots=Flux(
        sources=[
            CSVSource(filename="CRG/crg-*.csv"),
        ],
        transformation=Transformation(
            function=build_lots,
        ),
        destination=SplitDestination(
            name="Lot/lot",
            split_column="Lot",
        ),
    ),
 )
 def build_pnl(dfs: list[pd.DataFrame], year: int) -> pd.DataFrame:
    df = pd.concat(dfs)
    df = df[df["Année"] == year]
@ -40,8 +49,10 @@ def build_pnl_flux(year: int) -> Flux:
            CSVSource(filename=f"CRG/crg-{year}.csv"),
            CSVSource(filename=f"banque/banque-{year}.csv"),
        ],
-        transformation=build_pnl,
+        transformation=Transformation(
-        extra_kwrds={"year": year},
+            function=build_pnl,
            extra_kwrds={"year": year},
        ),
        destination=Destination(name=f"pnl/{year}"),
    )
@ -58,18 +69,15 @@ if __name__ == "__main__":
    mart_path = data_path / "datamart"
    assert mart_path.exists()
-    lot_fluxes = {
+    files = consume_fluxes(
-        "Lots": Flux(
+        fluxes=FLUXES_LOT, origin_path=gold_path, dest_path=mart_path
-            sources=[CSVSource(filename="CRG/crg-*.csv")],
+    )
-            transformation=Transformation(function=build_lots),
+
            destination=SplitDestination(name="Lot/lot", split_column="Lot"),
        ),
    }
    years = list(range(2017, 2024))
    # pnl_fluxes = {f"pnl-{year}": build_pnl_flux(year) for year in years}
    pnl_fluxes = {}
    files = consume_fluxes(
-        fluxes={**lot_fluxes, **pnl_fluxes}, origin_path=gold_path, dest_path=mart_path
+        fluxes=pnl_fluxes, origin_path=gold_path, dest_path=mart_path
    )
    print(files)
--- a/scripts/history_stagging.py
+++ b/scripts/history_stagging.py
@ -22,6 +22,12 @@ def extract_cat(cat: pd.DataFrame):
    return trans, cat_drop
 def lot_naming(value):
    """Return the lot label for a door/lot identifier.

    A value whose string form is numeric is zero-padded to two characters
    (``3`` -> ``"03"``); any other value maps to ``"PC"``.

    A ``pandas.Series`` is processed element-wise and a Series of labels is
    returned, so both ``series.apply(lot_naming)`` and ``lot_naming(series)``
    give the same result.
    """
    if isinstance(value, pd.Series):
        # str(Series) is the repr of the whole column and is never numeric;
        # without this branch every row would silently become "PC".
        return value.apply(lot_naming)
    text = str(value)
    if text.isnumeric():
        return text.zfill(2)
    return "PC"
 def trans_2017_2021(
    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
 ) -> pd.DataFrame:
@ -35,7 +41,7 @@ def trans_2017_2021(
        Porte=df["porte"],
        Débit=df["Débit"].fillna(0),
        Crédit=df["Crédit"].fillna(0),
-        Lot=df["immeuble"].astype(str) + df["porte"].astype("str").str.zfill(2),
+        Lot=df["porte"].apply(lot_naming),
        Année=df["Date"].astype(str).str.slice(0, 4),
        Mois=df["Date"].astype(str).str.slice(5, 7),
        Catégorie=df["Categorie"].replace(cat_trans),
@ -54,7 +60,7 @@ def trans_2022_charge(
        Porte=df["lot"],
        Débit=df["Débits"].fillna(0),
        Crédit=df["Crédits"].fillna(0),
-        Lot=df["immeuble"].astype(str)[0] + df["lot"].astype("str").str.zfill(2),
+        Lot=df["lot"].apply(lot_naming),
        Année=df["annee"],
        Mois=df["mois"],
        Catégorie=df["Catégorie Charge"],
@ -75,7 +81,7 @@ def trans_2022_loc(
        Porte=df["lot"],
        Débit=0,
        Crédit=df["Réglés"].fillna(0),
-        Lot=df["immeuble"].astype(str)[0] + df["lot"].astype("str").str.zfill(2),
+        Lot=df["lot"].apply(lot_naming),
        Année=df["annee"],
        Mois=df["mois"],
        Catégorie="Loyer Charge",
@ -93,7 +99,7 @@ def trans_2023(
    df = df.assign(
        Débit=df["Débit"].fillna(0),
        Crédit=df["Crédit"].fillna(0),
-        Lot=df["Immeuble"].astype(str) + df["Porte"].astype("str").str.zfill(2),
+        Lot=lot_naming(df["Porte"]),
        Année=year,
    )
    return df[stagging_columns]
@ -113,7 +119,7 @@ STAGGING_COLUMNS = [
    "Crédit",
 ]
-FLUXES = {
+FLUXES_CRG = {
    "2017 2021 - charge et locataire.xlsx": Flux(
        sources=[
            ExcelSource(
@ -179,7 +185,7 @@ if __name__ == "__main__":
    assert staging_crg_path.exists()
    crg_files = consume_fluxes(
-        fluxes=FLUXES,
+        fluxes=FLUXES_CRG,
        origin_path=history_crg_path,
        dest_path=staging_crg_path,
    )
--- a/scripts/stagging_gold.py
+++ b/scripts/stagging_gold.py
@ -14,6 +14,7 @@ def feature_crg(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    df = dfs[0]
    df = df.assign(
        Impact=df["Crédit"] - df["Débit"],
        Lot=df["Immeuble"].astype(str) + df["Lot"].astype("str"),
    )
    return df
@ -47,6 +48,12 @@ def build_crg_fluxes(
    return fluxes
 def FLUXES_CRG(staging_crg_path: Path):
    """Build the CRG fluxes for *staging_crg_path*.

    Delegates to ``build_crg_fluxes`` with the ``"*.csv"`` pattern and
    ``feature_crg`` as transformation, and returns its result unchanged.
    """
    build_options = {
        "pattern": "*.csv",
        "transformation": feature_crg,
    }
    return build_crg_fluxes(crg_path=staging_crg_path, **build_options)
 if __name__ == "__main__":
    data_path = Path("datas/")
    assert data_path.exists()
@ -61,10 +68,9 @@ if __name__ == "__main__":
    gold_crg_path = gold_path / "CRG"
    assert gold_crg_path.exists()
    fluxes = build_crg_fluxes(
        crg_path=staging_crg_path, pattern="*.csv", transformation=feature_crg
    )
    crg_files = consume_fluxes(
-        fluxes=fluxes, origin_path=staging_crg_path, dest_path=gold_crg_path
+        fluxes=FLUXES_CRG(staging_crg_path),
        origin_path=staging_crg_path,
        dest_path=gold_crg_path,
    )
    print(crg_files)
Author	SHA1	Message	Date
Bertrand Benjamin	e2805f9af2	Feat: write cli feature, datamart	2024-03-05 19:20:33 +01:00
Bertrand Benjamin	98691d5531	Feat: start CLI	2024-03-05 19:00:45 +01:00
Bertrand Benjamin	c6932c364b	Feat: move assign Lot to featured	2024-03-05 19:00:17 +01:00
Bertrand Benjamin	05430196d0	Feat: add some print (before logging)	2024-03-05 18:59:55 +01:00
Bertrand Benjamin	78576270db	Feat: adapt to new models	2024-03-05 18:59:01 +01:00
Bertrand Benjamin	4cc9e7b038	Fix: lot transformation	2024-03-04 20:09:20 +01:00