Feat: convert stagging2gold

Bertrand Benjamin 2024-03-03 06:39:27 +01:00
parent 9e5541a770
commit 25ede1789a
3 changed files with 224 additions and 88 deletions

scripts/flux.py (new file, 109 lines)

@@ -0,0 +1,109 @@
import logging
from abc import abstractmethod
from collections.abc import Callable
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class Source(BaseModel):
    filename: str

    @abstractmethod
    def get_df(self, base_path: Path) -> pd.DataFrame:
        raise NotImplementedError


class ExcelSource(Source):
    sheet_name: str

    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
        logger.debug(f"Get content of {filepath}")
        return pd.read_excel(filepath, sheet_name=self.sheet_name)


class CSVSource(Source):
    options: dict

    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
        logger.debug(f"Get content of {filepath}")
        return pd.read_csv(filepath, **self.options)


class Flux(BaseModel):
    sources: list[Source]
    transformation: Callable
    extra_kwrds: dict = {}


def to_csv(df: pd.DataFrame, dest_basename: Path) -> Path:
    dest = dest_basename.parent / (dest_basename.name + ".csv")
    if dest.exists():
        # Append without rewriting the header when the destination already exists.
        df.to_csv(dest, mode="a", header=False, index=False)
    else:
        df.to_csv(dest, index=False)
    return dest


def write_split_by(
    df: pd.DataFrame, column: str, dest_path: Path, writing_func
) -> list[Path]:
    wrote_files = []
    for col_value in df[column].unique():
        filtered_df = df[df[column] == col_value]
        dest_basename = dest_path / f"{col_value}"
        dest = writing_func(filtered_df, dest_basename)
        wrote_files.append(dest)
    return wrote_files


def extract_sources(sources: list[Source], base_path: Path = Path()):
    for src in sources:
        if "*" in src.filename:
            # Expand glob patterns (relative to base_path) into concrete sources.
            expanded_src = [
                src.model_copy(update={"filename": str(p.relative_to(base_path))})
                for p in base_path.glob(src.filename)
            ]
            yield from extract_sources(expanded_src, base_path)
        else:
            filepath = base_path / src.filename
            assert filepath.exists(), f"{filepath} does not exist"
            yield src.filename, src.get_df(base_path)


def split_duplicates(
    df, origin: str, duplicated: dict[str, pd.DataFrame]
) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]]:
    duplicates = df.duplicated()
    no_duplicates = df[~duplicates]
    duplicated[origin] = df[duplicates]
    return no_duplicates, duplicated


def consume_fluxes(
    fluxes: dict[str, Flux], origin_path: Path, dest_path: Path, writing_func=to_csv
):
    duplicated = {}
    wrote_files = []
    for name, flux in fluxes.items():
        logger.info(f"Processing flux {name}")
        src_df = []
        for filename, df in extract_sources(flux.sources, origin_path):
            df, duplicated = split_duplicates(df, str(filename), duplicated)
            src_df.append(df)
        df = flux.transformation(src_df, **flux.extra_kwrds)
        files = write_split_by(df, "Année", dest_path, writing_func)
        wrote_files += files
    return wrote_files
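
How these pieces compose, as a minimal usage sketch: the flux name, paths, and concat_sources transformation below are invented for illustration, and the input CSVs are assumed to carry the "Année" column that write_split_by splits on.

import pandas as pd
from pathlib import Path

from scripts.flux import CSVSource, Flux, consume_fluxes

def concat_sources(dfs: list[pd.DataFrame], **kwrds) -> pd.DataFrame:
    # Hypothetical transformation: concatenate the extracted sources unchanged.
    return pd.concat(dfs)

fluxes = {
    "example": Flux(
        # The "*" makes extract_sources expand this into one source per matching file.
        sources=[CSVSource(filename="*.csv", options={})],
        transformation=concat_sources,
    )
}
# Reads every CSV under datas/in, drops duplicated rows, then appends one CSV per "Année".
written = consume_fluxes(fluxes, origin_path=Path("datas/in"), dest_path=Path("datas/out"))
print(written)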

(modified file — name not shown; +46 additions, −88 deletions)
@@ -1,84 +1,17 @@
 import logging
-from collections.abc import Callable
 from pathlib import Path
 import pandas as pd
-from pydantic import BaseModel
+from scripts.flux import consume_fluxes
+from .flux import ExcelSource, Flux
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
-class Source(BaseModel):
-    filename: str
-    sheet_name: str = ""
-class Flux(BaseModel):
-    sources: list[Source]
-    transformation: Callable
-    extra_kwrds: dict = {}
-def to_csv(df, dest_basename):
-    dest = dest_basename.parent / (dest_basename.name + ".csv")
-    if dest.exists():
-        df.to_csv(dest, mode="a", header=False, index=False)
-    else:
-        df.to_csv(dest, index=False)
-    return dest
-def write_split_by(
-    df: pd.DataFrame, column: str, dest_path: Path, writing_func
-) -> list[Path]:
-    wrote_files = []
-    for col_value in df[column].unique():
-        filtered_df = df[df[column] == col_value]
-        dest_basename = dest_path / f"{col_value}"
-        dest = writing_func(filtered_df, dest_basename)
-        wrote_files.append(dest)
-    return wrote_files
-def extract_sources(sources: list[Source], base_path: Path = Path()):
-    for src in sources:
-        filepath = base_path / src.filename
-        assert filepath.exists
-        yield src.filename, pd.read_excel(filepath, sheet_name=src.sheet_name)
-def split_duplicates(
-    df, origin: str, duplicated: dict[str, pd.DataFrame]
-) -> [pd.DataFrame, dict[str, pd.DataFrame]]:
-    duplicates = df.duplicated()
-    no_duplicates = df[~duplicates]
-    duplicated[origin] = df[duplicates]
-    return no_duplicates, duplicated
-def crg(history_path: Path, staging_path: Path, metadatas: dict, writing_func=to_csv):
-    duplicated = {}
-    wrote_files = []
-    for name, metadata in metadatas.items():
-        logger.debug(f"Processing {name}")
-        src_df = []
-        for filename, df in extract_sources(metadata.sources, history_path):
-            df, duplicated = split_duplicates(df, str(filename), duplicated)
-            src_df.append(df)
-        df = metadata.transformation(src_df, **metadata.extra_kwrds)
-        files = write_split_by(df, "Année", staging_path, writing_func)
-        wrote_files += files
-    return wrote_files
-def extract_cat(cat):
+def extract_cat(cat: pd.DataFrame):
     cat_drop = list(cat[cat["Nouvelles"] == "NE PAS IMPORTER"]["Anciennes"])
     cat_trans = cat[cat["Nouvelles"] != "NE PAS IMPORTER"]
@@ -89,7 +22,9 @@ def extract_cat(cat):
     return trans, cat_drop
-def trans_2017_2021(dfs, **kwrds):
+def trans_2017_2021(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df, cat = dfs
     cat_trans, cat_drop = extract_cat(cat)
@@ -107,10 +42,12 @@ def trans_2017_2021(dfs, **kwrds):
         Fournisseur="",
     )
-    return df
+    return df[stagging_columns]
-def trans_2022_charge(dfs, **kwrds):
+def trans_2022_charge(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Immeuble=df["immeuble"],
@@ -126,10 +63,12 @@ def trans_2022_charge(dfs, **kwrds):
         Régie="Oralia - Gelas",
         Libellé="",
     )
-    return df
+    return df[stagging_columns]
-def trans_2022_loc(dfs, **kwrds):
+def trans_2022_loc(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Immeuble=df["immeuble"],
@@ -144,10 +83,12 @@ def trans_2022_loc(dfs, **kwrds):
         Régie="Oralia - Gelas",
         Libellé="",
     )
-    return df
+    return df[stagging_columns]
-def trans_2023(dfs, year, **kwrds):
+def trans_2023(
+    dfs: list[pd.DataFrame], year: str, stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Débit=df["Débit"].fillna(0),
@@ -155,43 +96,60 @@ def trans_2023(dfs, year, **kwrds):
         Lot=df["Immeuble"].astype(str) + df["Porte"].astype("str").str.zfill(2),
         Année=year,
     )
-    return df
+    return df[stagging_columns]
-METADATAS = {
+STAGGING_COLUMNS = [
+    "Régie",
+    "Immeuble",
+    "Porte",
+    "Lot",
+    "Année",
+    "Mois",
+    "Catégorie",
+    "Fournisseur",
+    "Libellé",
+    "Débit",
+    "Crédit",
+]
+FLUXES = {
     "2017 2021 - charge et locataire.xlsx": Flux(
         sources=[
-            Source(
+            ExcelSource(
                 filename="2017 2021 - charge et locataire.xlsx", sheet_name="DB CRG"
             ),
-            Source(
+            ExcelSource(
                 filename="2017 2021 - charge et locataire.xlsx",
                 sheet_name="Catégories",
             ),
         ],
         transformation=trans_2017_2021,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2022 - charge.xlsx": Flux(
         sources=[
-            Source(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
        ],
         transformation=trans_2022_charge,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2022 - locataire.xlsx": Flux(
         sources=[
-            Source(filename="2022 - locataire.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - locataire.xlsx", sheet_name="Sheet1"),
         ],
         transformation=trans_2022_loc,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2023 - charge et locataire.xlsx": Flux(
         sources=[
-            Source(
+            ExcelSource(
                 filename="2023 - charge et locataire.xlsx",
                 sheet_name="DB CRG 2023 ...",
             ),
         ],
         transformation=trans_2023,
-        extra_kwrds={"year": 2023},
+        extra_kwrds={"year": 2023, "stagging_columns": STAGGING_COLUMNS},
     ),
 }
@@ -208,5 +166,5 @@ if __name__ == "__main__":
     staging_crg_path = staging_path / "CRG"
     assert staging_crg_path.exists()
-    crg_files = crg(history_crg_path, staging_crg_path, METADATAS)
+    crg_files = consume_fluxes(FLUXES, history_crg_path, staging_crg_path)
     print(crg_files)
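
The recurring change in this diff is the final `return df[stagging_columns]`: every transformation now projects onto one shared staging schema instead of returning whatever columns it accumulated. A tiny self-contained sketch (row contents made up) of what that projection does:

import pandas as pd

STAGGING_COLUMNS = ["Régie", "Immeuble", "Porte", "Lot", "Année", "Mois",
                    "Catégorie", "Fournisseur", "Libellé", "Débit", "Crédit"]

# A frame as a transformation might leave it: right data, stray helper columns.
df = pd.DataFrame([{col: "" for col in STAGGING_COLUMNS} | {"immeuble": "A", "tmp": 0}])

# Selecting with the column list drops the helpers and fixes the column order,
# so every flux emits an identical staging layout.
staged = df[STAGGING_COLUMNS]
assert list(staged.columns) == STAGGING_COLUMNS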

scripts/stagging_gold.py (new file, 69 lines)

@@ -0,0 +1,69 @@
import logging
from collections.abc import Callable
from pathlib import Path

import pandas as pd

from scripts.flux import CSVSource, Flux, consume_fluxes

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def feature_crg(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    df = dfs[0]
    df = df.assign(
        # Signed amount of the row: credits count positively, debits negatively.
        Impact=df["Crédit"] - df["Débit"],
    )
    return df


GOLD_COLUMNS = [
    "Régie",
    "Immeuble",
    "Porte",
    "Lot",
    "Année",
    "Mois",
    "Catégorie",
    "Fournisseur",
    "Libellé",
    "Débit",
    "Crédit",
    "Impact",
]


def build_crg_fluxes(
    crg_path: Path, pattern: str, transformation: Callable, csv_options: dict | None = None
) -> dict[str, Flux]:
    # Build one flux per staging CSV matching the pattern.
    fluxes = {}
    for crg in crg_path.glob(pattern):
        fluxes[f"CRG - {crg}"] = Flux(
            sources=[CSVSource(filename=crg.name, options=csv_options or {})],
            transformation=transformation,
        )
    return fluxes


if __name__ == "__main__":
    data_path = Path("datas/")
    assert data_path.exists()
    staging_path = data_path / "staging"
    assert staging_path.exists()
    staging_crg_path = staging_path / "CRG"
    assert staging_crg_path.exists()
    gold_path = data_path / "gold"
    assert gold_path.exists()
    gold_crg_path = gold_path / "CRG"
    assert gold_crg_path.exists()

    fluxes = build_crg_fluxes(
        crg_path=staging_crg_path, pattern="*.csv", transformation=feature_crg
    )
    crg_files = consume_fluxes(
        fluxes=fluxes, origin_path=staging_crg_path, dest_path=gold_crg_path
    )
    print(crg_files)
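
A quick check of the gold-layer feature (figures invented; the import assumes running from the repo root): Impact is simply Crédit minus Débit, so a debit row comes out negative and a credit row positive.

import pandas as pd

from scripts.stagging_gold import feature_crg

df = pd.DataFrame({"Débit": [100.0, 0.0], "Crédit": [0.0, 250.0]})
print(feature_crg([df])["Impact"].tolist())  # [-100.0, 250.0]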