Feat: première version en notebook

2024-02-17 05:28:42 +01:00
commit 7aca068a8d
6 changed files with 6328 additions and 0 deletions
--- a/10
+++ b/10
@@ -0,0 +1,10 @@
+# Housekeeping targets that remove the CSV files of each data layer.
+#
+# The data root "PLESNA Compta SYSTEM" contains spaces, so every path is quoted:
+# unquoted, the shell splits it into three words and `rm -rf` would be aimed at
+# ./PLESNA and Compta instead of the CSV files.
+# `find` replaces the `**` glob because make runs recipes with /bin/sh, where
+# `**` is not recursive. Each command fails loudly if its layer folder is missing.
+.PHONY: clean_raw clean_built clean_all
+
+# Remove the raw CSV files.
+clean_raw:
+	find "./PLESNA Compta SYSTEM/raw" -type f -name '*.csv' -delete
+
+# Remove every CSV file derived from the raw layer.
+clean_built:
+	find "./PLESNA Compta SYSTEM/staging" -type f -name '*.csv' -delete
+	find "./PLESNA Compta SYSTEM/gold" -type f -name '*.csv' -delete
+	find "./PLESNA Compta SYSTEM/datamart" -type f -name '*.csv' -delete
+
+# Remove derived files first, then the raw ones.
+clean_all: clean_built clean_raw
+
--- a/README.md
+++ b/README.md
@@ -0,0 +1,32 @@
+# E(T)LT pour Plesna
+
+## Stages
+
+- Raw: fichiers les plus bruts possible
+
+  - historique (fichiers avant création du data warehouse)
+  - pdf (depuis les agences)
+  - banque (extract depuis le web)
+  - patrimoine
+
+- Staging: importation des données brutes triées par année
+
+  - locataire (fait)
+  - charge (fait)
+  - banque (fait)
+  - patrimoine (dimension): description des lots
+
+- Gold: données enrichies (catégorie, lots...) et testées
+
+  - locataire
+  - charge
+  - banque
+  - patrimoine
+
+- DataMart: données organisées pour analyse
+
+  - lot\_<lot_id>\_loyer: historique de tous les loyers du lot
+  - lot\_<lot_id>\_travaux: historique de tous les travaux du lot
+  - lot\_<lot_id>\_description: historique des identifications du lot
+
+  - pnl\_<annee>: agrégation des loyers, charges et banques par mois et immeuble
--- a/notebooks/auto_tagging.ipynb
+++ b/notebooks/auto_tagging.ipynb
--- a/notebooks/gold2mart.ipynb
+++ b/notebooks/gold2mart.ipynb
--- a/notebooks/histo2staging.ipynb
+++ b/notebooks/histo2staging.ipynb
--- a/notebooks/staging2gold.ipynb
+++ b/notebooks/staging2gold.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "bc224455-95ed-4e33-864d-442396301cd4",
+   "metadata": {},
+   "source": [
+    "# Staging vers Gold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d5dff9f3-ec7d-4fc7-8471-5ed1fbf6cf06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4e5779f6-e0ad-46f8-b684-49af4205f084",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Layer folders; these relative paths are resolved against the kernel's working directory.\n",
+    "data_root = Path(\"../PLESNA Compta SYSTEM\")\n",
+    "staging_path = data_root / \"staging\"\n",
+    "gold_path = data_root / \"gold\"\n",
+    "# Fail fast when a layer folder is missing.\n",
+    "for layer_dir in (staging_path, gold_path):\n",
+    "    assert layer_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2074af18-4f81-49cb-9d9c-f50e7408e7fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def to_csv(df, dest):\n",
+    "    \"\"\"Write ``df`` to the CSV file ``dest``, appending when it already holds data.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    df : pandas.DataFrame\n",
+    "        Rows to write; the index is never written.\n",
+    "    dest : pathlib.Path\n",
+    "        Target CSV file. Missing parent folders are created.\n",
+    "\n",
+    "    Notes\n",
+    "    -----\n",
+    "    When appending, rows are written without a header and the columns are\n",
+    "    not checked against the existing file.\n",
+    "    \"\"\"\n",
+    "    # Create the destination folder on first use so a fresh output tree does not fail.\n",
+    "    dest.parent.mkdir(parents=True, exist_ok=True)\n",
+    "    # Only a non-empty file already carries the header row; an empty one still needs it.\n",
+    "    append = dest.exists() and dest.stat().st_size > 0\n",
+    "    df.to_csv(dest, mode=\"a\" if append else \"w\", header=not append, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cc74ba91-855a-41e7-8709-122425f98fb6",
+   "metadata": {},
+   "source": [
+    "### clean gold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "82de8bc5-8d1e-47fb-af28-076ed90835a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Empty the gold layer first: to_csv() appends to files that already exist,\n",
+    "# so leftovers from a previous run would be duplicated.\n",
+    "for stale_csv in gold_path.rglob(\"*.csv\"):\n",
+    "    stale_csv.unlink()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "539446e1-835e-4d79-a8d8-ddd5823f30f9",
+   "metadata": {},
+   "source": [
+    "## CRG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a6423b7d-657f-4897-8dd3-fbca68318367",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2020.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2018.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2022.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2021.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2023.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2019.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2017.csv')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "crg_path = staging_path / \"CRG\"\n",
+    "assert crg_path.exists()\n",
+    "# glob() yields files in filesystem order; sort them so every run processes\n",
+    "# and logs the files in the same order.\n",
+    "crg_files = sorted(crg_path.glob(\"*.csv\"))\n",
+    "print(crg_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "edcf15c4-aa3c-40c7-805d-ae8933decf8c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "../PLESNA Compta SYSTEM/gold/CRG/2020.csv\n",
+      "../PLESNA Compta SYSTEM/gold/CRG/2018.csv\n",
+      "../PLESNA Compta SYSTEM/gold/CRG/2022.csv\n",
+      "../PLESNA Compta SYSTEM/gold/CRG/2021.csv\n",
+      "../PLESNA Compta SYSTEM/gold/CRG/2023.csv\n",
+      "../PLESNA Compta SYSTEM/gold/CRG/2019.csv\n",
+      "../PLESNA Compta SYSTEM/gold/CRG/2017.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Copy each staging CRG file to gold with an extra signed \"Impact\" column.\n",
+    "for src in crg_files:\n",
+    "    staged = pd.read_csv(src)\n",
+    "    # Impact = Crédit - Débit.\n",
+    "    # NOTE(review): a blank Crédit or Débit cell yields NaN here; confirm staging fills them.\n",
+    "    impact = staged[\"Crédit\"] - staged[\"Débit\"]\n",
+    "    enriched = staged.assign(Impact=impact)\n",
+    "    dest = gold_path / \"CRG\" / src.name\n",
+    "    print(dest)\n",
+    "    to_csv(enriched, dest)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "811f6b89-be5a-4290-b3d5-466ec42eb3ae",
+   "metadata": {},
+   "source": [
+    "## Banque"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c017b0a4-8c41-482e-85b1-4a10be84270b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[PosixPath('../PLESNA Compta SYSTEM/staging/Banque/2020.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/Banque/2022.csv'), PosixPath('../PLESNA Compta SYSTEM/staging/Banque/2021.csv')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "banque_path = staging_path / \"Banque\"\n",
+    "assert banque_path.exists()\n",
+    "# glob() yields files in filesystem order; sort them so every run processes\n",
+    "# and logs the files in the same order.\n",
+    "banque_files = sorted(banque_path.glob(\"*.csv\"))\n",
+    "print(banque_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b04b0d11-dd74-4463-bd6f-c59528cc080e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "../PLESNA Compta SYSTEM/gold/Banque/2020.csv\n",
+      "../PLESNA Compta SYSTEM/gold/Banque/2022.csv\n",
+      "../PLESNA Compta SYSTEM/gold/Banque/2021.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Copy each staging Banque file to gold with an extra signed \"Impact\" column.\n",
+    "for src in banque_files:\n",
+    "    staged = pd.read_csv(src)\n",
+    "    # Impact = Crédit - Débit.\n",
+    "    # NOTE(review): a blank Crédit or Débit cell yields NaN here; confirm staging fills them.\n",
+    "    impact = staged[\"Crédit\"] - staged[\"Débit\"]\n",
+    "    enriched = staged.assign(Impact=impact)\n",
+    "    dest = gold_path / \"Banque\" / src.name\n",
+    "    print(dest)\n",
+    "    to_csv(enriched, dest)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}