Feat: csv extraction and flattening

2019-08-04 21:57:27 +02:00
parent 9358c10b47
commit 2296615cb4
3 changed files with 247 additions and 24 deletions
--- a/recopytex/__init__.py
+++ b/recopytex/__init__.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+# Labels of the csv columns that describe a question instead of holding the
+# score of one student. csv_extraction.extract_students treats every column
+# that is not listed here as a student.
+NO_STUDENT_COLUMNS = [
+    "Trimestre",
+    "Nom",
+    "Date",
+    "Exercice",
+    "Question",
+    "Competence",
+    "Domaine",
+    "Commentaire",
+    "Bareme",
+    "Niveau",
+]
--- a/recopytex/csv_extraction.py
+++ b/recopytex/csv_extraction.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+""" Extracting and flattening data from csv files """
+
+import pandas as pd
+from . import NO_STUDENT_COLUMNS
+
+# Use the full option key: set_option() resolves a partial name by pattern
+# matching, which fails as soon as several options match it (on recent pandas
+# "Precision" matches both "display.precision" and "styler.format.precision").
+pd.set_option("display.precision", 2)
+
+
+def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
+    """ Give the labels of the columns of df that are students
+
+    :param df: the dataframe
+    :param no_student_columns: labels of the columns that are not students
+    :return: the labels of df.columns that are absent from
+        no_student_columns, as computed by df.columns.difference
+    """
+    return df.columns.difference(no_student_columns)
+
+
+def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
+    """ Flatten the student columns of df into "student" and "score"
+
+    :param df: the dataframe (one row per question, one column per student)
+    :param no_student_columns: columns that are not students; they are kept
+        as identifier columns and repeated for every student
+    :return: dataframe with one row per question and student, made of the
+        no_student_columns followed by "student" and "score"
+
+    Columns of csv files:
+
+    - NO_STUDENT_COLUMNS
+    - one for each student
+
+    The frames built for each student are concatenated without renumbering,
+    so index labels repeat from one student to the next.
+
+    When df has no student column, the result is an empty dataframe with
+    the same columns.
+    """
+    students = extract_students(df, no_student_columns)
+    if len(students) == 0:
+        # pd.concat refuses an empty list of frames (ValueError), so the
+        # "no student" case is answered directly.
+        flat_columns = list(no_student_columns) + ["student", "score"]
+        return pd.DataFrame(columns=flat_columns)
+
+    return pd.concat(
+        [
+            pd.melt(
+                df,
+                id_vars=no_student_columns,
+                value_vars=st,
+                var_name="student",
+                value_name="score",
+            )
+            for st in students
+        ]
+    )
+
+
+def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
+    """ Flat and clear the dataframe extracted from csv
+
+    :param csv_df: data frame read from csv
+    :param no_student_columns: columns that are not students
+    :return: dataframe with one row per question and student, with
+        lowercased column names and "" instead of missing values in the
+        "question", "exercice", "commentaire" and "competence" columns
+
+    """
+    # The caller's column list has to reach flat_df_students, otherwise the
+    # default list is used whatever was asked.
+    df = flat_df_students(csv_df, no_student_columns)
+
+    df.columns = df.columns.map(lambda x: x.lower())
+
+    # Fill through the frame: df[col].fillna(..., inplace=True) works on an
+    # intermediate Series and is not guaranteed to update df (pandas
+    # copy-on-write). Only the columns that exist are filled, so a custom
+    # no_student_columns lacking one of them does not raise KeyError.
+    text_columns = ("question", "exercice", "commentaire", "competence")
+    fill_values = {col: "" for col in text_columns if col in df.columns}
+    if fill_values:
+        df = df.fillna(fill_values)
+
+    return df
+
+
+# -----------------------------
+# Settings for 'vim'
+# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
+# cursor: 16 del