plesna/scripts/stagging_gold.py

import logging
from collections.abc import Callable
from pathlib import Path

import pandas as pd

from scripts.flux import CSVSource, Destination, Flux, consume_fluxes

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def feature_crg(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    df = dfs[0]
    df = df.assign(
        Impact=df["Crédit"] - df["Débit"],
    )
    return df


GOLD_COLUMNS = [
    "Régie",
    "Immeuble",
    "Porte",
    "Lot",
    "Année",
    "Mois",
    "Catégorie",
    "Fournisseur",
    "Libellé",
    "Débit",
    "Crédit",
    "Impact",
]


def build_crg_fluxes(
    crg_path: Path, pattern: str, transformation: Callable, csv_options: dict = {}
) -> dict[str, Flux]:
    fluxes = {}
    for file in crg_path.glob(pattern):
        fluxes[f"CRG - {file.name}"] = Flux(
            sources=[CSVSource(filename=file.name, options=csv_options)],
            transformation=transformation,
            destination=Destination(name=file.name),
        )
    return fluxes


if __name__ == "__main__":
    data_path = Path("datas/")
    assert data_path.exists()

    staging_path = data_path / "staging"
    assert staging_path.exists()
    staging_crg_path = staging_path / "CRG"
    assert staging_crg_path.exists()

    gold_path = data_path / "gold"
    assert gold_path.exists()
    gold_crg_path = gold_path / "CRG"
    assert gold_crg_path.exists()

    fluxes = build_crg_fluxes(
        crg_path=staging_crg_path, pattern="*.csv", transformation=feature_crg
    )
    crg_files = consume_fluxes(
        fluxes=fluxes, origin_path=staging_crg_path, dest_path=gold_crg_path
    )
    print(crg_files)