Feat: convert stagging2gold
parent 9e5541a770
commit 25ede1789a

109 scripts/flux.py Normal file
@@ -0,0 +1,109 @@
import logging
from abc import abstractmethod
from collections.abc import Callable
from pathlib import Path

import pandas as pd
from pydantic import BaseModel

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class Source(BaseModel):
    """A single input file feeding a flux."""

    filename: str

    @abstractmethod
    def get_df(self, base_path: Path) -> pd.DataFrame:
        raise NotImplementedError


class ExcelSource(Source):
    sheet_name: str

    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
        logger.debug(f"Get content of {filepath}")
        return pd.read_excel(filepath, sheet_name=self.sheet_name)


class CSVSource(Source):
    options: dict

    def get_df(self, base_path: Path) -> pd.DataFrame:
        filepath = base_path / self.filename
        logger.debug(f"Get content of {filepath}")
        return pd.read_csv(filepath, **self.options)


class Flux(BaseModel):
    """A set of sources, the transformation applied to them, and its keyword arguments."""

    sources: list[Source]
    transformation: Callable
    extra_kwrds: dict = {}


def to_csv(df, dest_basename):
    # Append to the destination CSV if it already exists, create it otherwise.
    dest = dest_basename.parent / (dest_basename.name + ".csv")
    if dest.exists():
        df.to_csv(dest, mode="a", header=False, index=False)
    else:
        df.to_csv(dest, index=False)
    return dest


def write_split_by(
    df: pd.DataFrame, column: str, dest_path: Path, writing_func
) -> list[Path]:
    """Write one file per distinct value of `column` and return the written paths."""
    wrote_files = []

    for col_value in df[column].unique():
        filtered_df = df[df[column] == col_value]

        dest_basename = dest_path / f"{col_value}"
        dest = writing_func(filtered_df, dest_basename)
        wrote_files.append(dest)

    return wrote_files


def extract_sources(sources: list[Source], base_path: Path = Path()):
    for src in sources:
        if "*" in src.filename:
            # Expand glob patterns relative to base_path, then recurse on the result.
            expanded_src = [
                src.model_copy(update={"filename": str(p.relative_to(base_path))})
                for p in base_path.glob(src.filename)
            ]
            yield from extract_sources(expanded_src, base_path)
        else:
            filepath = base_path / src.filename
            assert filepath.exists()
            yield src.filename, src.get_df(base_path)


def split_duplicates(
    df, origin: str, duplicated: dict[str, pd.DataFrame]
) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]]:
    # Keep the first occurrence of each row; stash the duplicates under their origin.
    duplicates = df.duplicated()
    no_duplicates = df[~duplicates]
    duplicated[origin] = df[duplicates]
    return no_duplicates, duplicated


def consume_fluxes(
    fluxes: dict[str, Flux], origin_path: Path, dest_path: Path, writing_func=to_csv
):
    duplicated = {}
    wrote_files = []

    for name, flux in fluxes.items():
        logger.info(f"Processing flux {name}")
        src_df = []
        for filename, df in extract_sources(flux.sources, origin_path):
            df, duplicated = split_duplicates(df, str(filename), duplicated)
            src_df.append(df)

        df = flux.transformation(src_df, **flux.extra_kwrds)

        files = write_split_by(df, "Année", dest_path, writing_func)
        wrote_files += files
    return wrote_files
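
A minimal sketch of how the new flux API is meant to be driven. The paths, workbook name, and the passthrough transformation below are hypothetical; note that the transformed frame must carry an "Année" column, since consume_fluxes splits output files on it:

from pathlib import Path

import pandas as pd

from scripts.flux import ExcelSource, Flux, consume_fluxes


def passthrough(dfs: list[pd.DataFrame], **kwrds) -> pd.DataFrame:
    # No-op transformation: just concatenate the extracted sources.
    return pd.concat(dfs)


fluxes = {
    "demo": Flux(
        sources=[ExcelSource(filename="demo.xlsx", sheet_name="Sheet1")],
        transformation=passthrough,
    )
}

# Reads datas/history/demo.xlsx, drops duplicated rows, applies the
# transformation, then writes one CSV per "Année" value under datas/staging/.
written = consume_fluxes(fluxes, Path("datas/history"), Path("datas/staging"))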
@@ -1,84 +1,17 @@
 import logging
-from collections.abc import Callable
 from pathlib import Path
 
 import pandas as pd
-from pydantic import BaseModel
+
+from scripts.flux import consume_fluxes
+
+from .flux import ExcelSource, Flux
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
 
-class Source(BaseModel):
-    filename: str
-    sheet_name: str = ""
-
-
-class Flux(BaseModel):
-    sources: list[Source]
-    transformation: Callable
-    extra_kwrds: dict = {}
-
-
-def to_csv(df, dest_basename):
-    dest = dest_basename.parent / (dest_basename.name + ".csv")
-    if dest.exists():
-        df.to_csv(dest, mode="a", header=False, index=False)
-    else:
-        df.to_csv(dest, index=False)
-    return dest
-
-
-def write_split_by(
-    df: pd.DataFrame, column: str, dest_path: Path, writing_func
-) -> list[Path]:
-    wrote_files = []
-
-    for col_value in df[column].unique():
-        filtered_df = df[df[column] == col_value]
-
-        dest_basename = dest_path / f"{col_value}"
-        dest = writing_func(filtered_df, dest_basename)
-        wrote_files.append(dest)
-
-    return wrote_files
-
-
-def extract_sources(sources: list[Source], base_path: Path = Path()):
-    for src in sources:
-        filepath = base_path / src.filename
-        assert filepath.exists
-        yield src.filename, pd.read_excel(filepath, sheet_name=src.sheet_name)
-
-
-def split_duplicates(
-    df, origin: str, duplicated: dict[str, pd.DataFrame]
-) -> [pd.DataFrame, dict[str, pd.DataFrame]]:
-    duplicates = df.duplicated()
-    no_duplicates = df[~duplicates]
-    duplicated[origin] = df[duplicates]
-    return no_duplicates, duplicated
-
-
-def crg(history_path: Path, staging_path: Path, metadatas: dict, writing_func=to_csv):
-    duplicated = {}
-    wrote_files = []
-
-    for name, metadata in metadatas.items():
-        logger.debug(f"Processing {name}")
-        src_df = []
-        for filename, df in extract_sources(metadata.sources, history_path):
-            df, duplicated = split_duplicates(df, str(filename), duplicated)
-            src_df.append(df)
-
-        df = metadata.transformation(src_df, **metadata.extra_kwrds)
-
-        files = write_split_by(df, "Année", staging_path, writing_func)
-        wrote_files += files
-    return wrote_files
-
-
-def extract_cat(cat):
+def extract_cat(cat: pd.DataFrame):
     cat_drop = list(cat[cat["Nouvelles"] == "NE PAS IMPORTER"]["Anciennes"])
     cat_trans = cat[cat["Nouvelles"] != "NE PAS IMPORTER"]
 
@@ -89,7 +22,9 @@ def extract_cat(cat):
     return trans, cat_drop
 
 
-def trans_2017_2021(dfs, **kwrds):
+def trans_2017_2021(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df, cat = dfs
     cat_trans, cat_drop = extract_cat(cat)
 
@@ -107,10 +42,12 @@ def trans_2017_2021(dfs, **kwrds):
         Fournisseur="",
     )
 
-    return df
+    return df[stagging_columns]
 
 
-def trans_2022_charge(dfs, **kwrds):
+def trans_2022_charge(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Immeuble=df["immeuble"],
@@ -126,10 +63,12 @@ def trans_2022_charge(dfs, **kwrds):
         Régie="Oralia - Gelas",
         Libellé="",
     )
-    return df
+    return df[stagging_columns]
 
 
-def trans_2022_loc(dfs, **kwrds):
+def trans_2022_loc(
+    dfs: list[pd.DataFrame], stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Immeuble=df["immeuble"],
@@ -144,10 +83,12 @@ def trans_2022_loc(dfs, **kwrds):
         Régie="Oralia - Gelas",
         Libellé="",
     )
-    return df
+    return df[stagging_columns]
 
 
-def trans_2023(dfs, year, **kwrds):
+def trans_2023(
+    dfs: list[pd.DataFrame], year: str, stagging_columns: list[str], **kwrds
+) -> pd.DataFrame:
     df = dfs[0]
     df = df.assign(
         Débit=df["Débit"].fillna(0),
@@ -155,43 +96,60 @@ def trans_2023(dfs, year, **kwrds):
         Lot=df["Immeuble"].astype(str) + df["Porte"].astype("str").str.zfill(2),
         Année=year,
     )
-    return df
+    return df[stagging_columns]
 
 
-METADATAS = {
+STAGGING_COLUMNS = [
+    "Régie",
+    "Immeuble",
+    "Porte",
+    "Lot",
+    "Année",
+    "Mois",
+    "Catégorie",
+    "Fournisseur",
+    "Libellé",
+    "Débit",
+    "Crédit",
+]
+
+FLUXES = {
     "2017 2021 - charge et locataire.xlsx": Flux(
         sources=[
-            Source(
+            ExcelSource(
                 filename="2017 2021 - charge et locataire.xlsx", sheet_name="DB CRG"
             ),
-            Source(
+            ExcelSource(
                 filename="2017 2021 - charge et locataire.xlsx",
                 sheet_name="Catégories",
             ),
         ],
         transformation=trans_2017_2021,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2022 - charge.xlsx": Flux(
         sources=[
-            Source(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
        ],
         transformation=trans_2022_charge,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2022 - locataire.xlsx": Flux(
         sources=[
-            Source(filename="2022 - locataire.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - locataire.xlsx", sheet_name="Sheet1"),
         ],
         transformation=trans_2022_loc,
+        extra_kwrds={"stagging_columns": STAGGING_COLUMNS},
     ),
     "2023 - charge et locataire.xlsx": Flux(
         sources=[
-            Source(
+            ExcelSource(
                 filename="2023 - charge et locataire.xlsx",
                 sheet_name="DB CRG 2023 ...",
             ),
         ],
         transformation=trans_2023,
-        extra_kwrds={"year": 2023},
+        extra_kwrds={"year": 2023, "stagging_columns": STAGGING_COLUMNS},
     ),
 }
 
@@ -208,5 +166,5 @@ if __name__ == "__main__":
     staging_crg_path = staging_path / "CRG"
     assert staging_crg_path.exists()
 
-    crg_files = crg(history_crg_path, staging_crg_path, METADATAS)
+    crg_files = consume_fluxes(FLUXES, history_crg_path, staging_crg_path)
     print(crg_files)
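
Worth noting for review: consume_fluxes forwards extra_kwrds as keyword arguments, i.e. flux.transformation(src_df, **flux.extra_kwrds), which is how {"year": 2023, "stagging_columns": STAGGING_COLUMNS} reaches the named parameters of trans_2023. A tiny self-contained sketch of that call shape (trans_demo and its columns are made up):

import pandas as pd


def trans_demo(
    dfs: list[pd.DataFrame], year: str, stagging_columns: list[str], **kwrds
) -> pd.DataFrame:
    # Same shape as the trans_* functions: enrich, then project onto the
    # staging column order.
    return dfs[0].assign(Année=year)[stagging_columns]


extra_kwrds = {"year": "2023", "stagging_columns": ["Régie", "Année"]}
df = trans_demo([pd.DataFrame({"Régie": ["Oralia - Gelas"]})], **extra_kwrds)
print(df)  # one row with columns Régie and Année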

69 scripts/stagging_gold.py Normal file
@@ -0,0 +1,69 @@
import logging
from collections.abc import Callable
from pathlib import Path

import pandas as pd

from scripts.flux import CSVSource, Flux, consume_fluxes

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def feature_crg(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    # Gold-level feature: signed impact of each line on the balance.
    df = dfs[0]
    df = df.assign(
        Impact=df["Crédit"] - df["Débit"],
    )
    return df


GOLD_COLUMNS = [
    "Régie",
    "Immeuble",
    "Porte",
    "Lot",
    "Année",
    "Mois",
    "Catégorie",
    "Fournisseur",
    "Libellé",
    "Débit",
    "Crédit",
    "Impact",
]


def build_crg_fluxes(
    crg_path: Path, pattern: str, transformation: Callable, csv_options: dict = {}
) -> dict[str, Flux]:
    # Build one Flux per staging CSV matching the pattern.
    fluxes = {}
    for crg in crg_path.glob(pattern):
        fluxes[f"CRG - {crg}"] = Flux(
            sources=[CSVSource(filename=crg.name, options=csv_options)],
            transformation=transformation,
        )
    return fluxes


if __name__ == "__main__":
    data_path = Path("datas/")
    assert data_path.exists()

    staging_path = data_path / "staging"
    assert staging_path.exists()
    staging_crg_path = staging_path / "CRG"
    assert staging_crg_path.exists()

    gold_path = data_path / "gold"
    assert gold_path.exists()
    gold_crg_path = gold_path / "CRG"
    assert gold_crg_path.exists()

    fluxes = build_crg_fluxes(
        crg_path=staging_crg_path, pattern="*.csv", transformation=feature_crg
    )
    crg_files = consume_fluxes(
        fluxes=fluxes, origin_path=staging_crg_path, dest_path=gold_crg_path
    )
    print(crg_files)
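
A quick self-contained check of the gold feature, with toy values (assumes the module is importable as scripts.stagging_gold):

import pandas as pd

from scripts.stagging_gold import feature_crg

df = pd.DataFrame({"Débit": [100.0, 0.0], "Crédit": [0.0, 250.0]})
# Impact = Crédit - Débit for each row.
print(feature_crg([df])["Impact"].tolist())  # [-100.0, 250.0]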