repytex/Repytex/tools/df_marks_manip.py

502 lines
18 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
import numpy as np
from math import ceil, floor
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Value found in "Note" when the question was left without an answer.
NOANSWER = "."
# NOTE(review): not referenced in the visible part of this module; presumably
# the marker of a question that is not rated - confirm against callers.
NORATED = ""
# Values manipulations
def round_half_point(val):
    """ Round a value up to the next half point (multiple of 0.5)

    Values that cannot be rounded (NaN, infinities, non numeric objects)
    are returned unchanged.

    :param val: the value to round
    :returns: the smallest multiple of 0.5 greater than or equal to val, or
        val itself when it cannot be rounded

    >>> round_half_point(1.33)
    1.5
    >>> round_half_point(2)
    2.0
    >>> round_half_point("a")
    'a'
    """
    try:
        return 0.5 * ceil(2.0 * val)
    except (ValueError, OverflowError, TypeError):
        # ceil raises ValueError on NaN and OverflowError on infinities;
        # TypeError covers operands that are not numbers.
        return val
def num_format(num):
    """ Turn a number into the string used to display it

    Whole numbers are written without decimal part, every other value is
    written with one decimal and a comma as decimal separator.

    :param num: the number to format
    :returns: the formatted string

    >>> num_format(2.0)
    '2'
    >>> num_format(1.33)
    '1,3'
    """
    try:
        whole = int(num)
        is_whole = bool(whole == num)
    except ValueError:
        # int() refuses NaN: fall back to the decimal form
        is_whole = False
    if is_whole:
        return str(whole)
    one_decimal = f"{num:.1f}"
    return one_decimal.replace(".", ",")
# LaTeX macros used by note_to_rep for "Niveau" notes: index 0 is used for a
# question without answer, indices 1 to 4 are used for the notes 0 to 3.
latex_caract = ["\\NoRep", "\\RepZ", "\\RepU", "\\RepD", "\\RepT"]
def note_to_rep(x):
    r""" Give the representation of a note, as a LaTeX macro when possible

    Only "Niveau" notes are translated: the no-answer marker and the notes
    0, 1, 2 and 3 are replaced by the matching entry of latex_caract. Any
    other note is returned untouched.

    :param x: dictionary with "Niveau" and "Note" keys
    :returns: a LaTeX macro name or the original note

    >>> note_to_rep({"Niveau": 0, "Note": 1.5})
    1.5
    >>> note_to_rep({"Niveau": 1, "Note": 1})
    '\\RepU'
    >>> note_to_rep({"Niveau": 1, "Note": "."})
    '\\NoRep'
    """
    if not x["Niveau"]:
        return x["Note"]

    note = x["Note"]
    if note == NOANSWER:
        return latex_caract[0]
    if note in range(4):
        # macros of the notes 0..3 are stored after the no-answer macro
        return latex_caract[int(note) + 1]
    return note
def note_to_mark(x):
    """ Compute the mark of a row, converting "Niveau" notes into points

    A "Niveau" note (0, 1, 2 or 3) is worth note * bareme / 3 and the
    no-answer marker is worth 0. Other notes are already marks and are
    returned as they are; a warning is logged when such a note is greater
    than the bareme.

    :param x: dictionary with "Niveau", "Note" and "Bareme" keys
    :returns: the mark
    :raises ValueError: when a "Niveau" note is not 0, 1, 2 or 3

    >>> note_to_mark({"Niveau": 0, "Note": 1.5, "Bareme": 2})
    1.5
    >>> note_to_mark({"Niveau": 1, "Note": 2, "Bareme": 2})
    1.3333333333333333
    >>> note_to_mark({"Niveau": 1, "Note": ".", "Bareme": 2})
    0
    """
    if not x["Niveau"]:
        # Plain mark: only check it against the rating scale
        if x["Note"] > x["Bareme"]:
            logger.warning(f"The note ({x['Note']}) is greated than the rating scale ({x['Bareme']}) at {x}")
        return x["Note"]

    if x["Note"] == NOANSWER:
        return 0
    if x["Note"] not in (0, 1, 2, 3):
        raise ValueError(f"The evaluation is out of range: {x['Note']} at {x}")
    return x["Note"] * x["Bareme"] / 3
def note_to_level(x):
    """ Compute the level ("na", 0, 1, 2, 3) of a row

    "na" stands for "no answer": it is returned when the note is missing
    (NaN / None), when it is the no-answer marker, or when the bareme is
    missing or equal to 0.
    A "Niveau" note is already a level. Any other note is scaled to the
    0..3 range with ceil(note / bareme * 3).

    :param x: dictionary with "Niveau", "Note" and "Bareme" keys
    :returns: "na" or the level as an int

    >>> note_to_level({"Note": 1, "Niveau": 0, "Bareme": 1})
    3
    >>> note_to_level({"Note": 0.33, "Niveau": 0, "Bareme": 1})
    1
    >>> note_to_level({"Note": float("nan"), "Niveau": 0, "Bareme": 2})
    'na'
    >>> note_to_level({"Note": ".", "Niveau": 1, "Bareme": 2})
    'na'
    >>> note_to_level({"Note": 2, "Niveau": 1, "Bareme": 2})
    2
    """
    note = x["Note"]
    # A missing note cannot be converted by int() / ceil(): treat it like
    # the no-answer marker instead of raising.
    if pd.isnull(note) or note == NOANSWER:
        return "na"

    bareme = x["Bareme"]
    if pd.isnull(bareme) or bareme == 0:
        return "na"

    if x["Niveau"]:
        return int(note)
    return int(ceil(note / bareme * 3))
def mark_bareme_formater(row):
    """ Build the "mark / bareme" string of a row

    :param row: mapping with "Mark" and "Bareme" keys
    :returns: both values formatted by num_format and joined with " / "
    """
    mark_txt = num_format(row["Mark"])
    bareme_txt = num_format(row["Bareme"])
    return " / ".join([mark_txt, bareme_txt])
def question_uniq_formater(row):
    """ Create a kind of unique description of the question

    The description is "<exercice> <question>", followed by the comment
    between brackets when the row has a non null "Commentaire".
    An exercice that converts to an integer is prefixed with "Exo", a
    question that converts to an integer is prefixed with "Qu". A missing
    question (NaN or None) is left out.

    :param row: mapping with "Exercice" and "Question" keys and an optional
        "Commentaire" key
    :returns: the description string

    >>> question_uniq_formater({"Exercice": "Ex1", "Question": "Q1"})
    'Ex1 Q1'
    >>> question_uniq_formater({"Exercice": 1, "Question": 2})
    'Exo1 Qu2'
    >>> question_uniq_formater({"Exercice": "Ex1", "Question": None})
    'Ex1 '
    >>> question_uniq_formater({"Exercice": 2, "Question": "Q1", "Commentaire": "bonus"})
    'Exo2 Q1 (bonus)'
    """
    exercice = row["Exercice"]
    try:
        int(exercice)
    except (ValueError, TypeError):
        # int() raises TypeError (not ValueError) on None
        exo_label = str(exercice)
    else:
        exo_label = "Exo" + str(exercice)

    question = row["Question"]
    try:
        int(question)
    except (ValueError, TypeError):
        # NaN raises ValueError, None raises TypeError: both mean "missing"
        quest_label = "" if pd.isnull(question) else str(question)
    else:
        quest_label = "Qu" + str(question)

    ans = f"{exo_label} {quest_label}"

    # "Commentaire" is optional
    try:
        comment = row["Commentaire"]
    except KeyError:
        comment = None
    if not pd.isnull(comment):
        ans += " ({})".format(comment)
    return ans
# DataFrame columns manipulations
def compute_marks(df):
    """ Compute the marks of every row of df

    note_to_mark is applied to each row; the result is meant to be stored
    in the "Mark" column.

    :param df: DataFrame with "Note", "Niveau" and "Bareme" columns.
    :returns: Series of marks indexed like df

    >>> df = pd.DataFrame({"Note": [1, 3], "Niveau": [0, 1], "Bareme": [2, 2]})
    >>> compute_marks(df).tolist()
    [1.0, 2.0]
    """
    needed = ["Note", "Niveau", "Bareme"]
    rows = df[needed]
    return rows.apply(note_to_mark, axis="columns")
def compute_level(df):
    """ Compute the level of every row of df

    note_to_level is applied to each row; the result is meant to be stored
    in the "Level" column.

    :param df: DataFrame with "Note", "Niveau" and "Bareme" columns.
    :returns: Series of levels ("na", 0, 1, 2 or 3) indexed like df

    >>> df = pd.DataFrame({"Note": [0.33, 3], "Niveau": [0, 1], "Bareme": [1, 2]})
    >>> compute_level(df).tolist()
    [1, 3]
    """
    needed = ["Note", "Niveau", "Bareme"]
    rows = df[needed]
    return rows.apply(note_to_level, axis="columns")
def compute_latex_rep(df):
    r""" Compute the LaTeX representation of every row of df

    note_to_rep is applied to each row and the missing values of the
    result are replaced by "??". The result is meant to be stored in the
    "Latex_rep" column.

    :param df: DataFrame with "Note" and "Niveau" columns.
    :returns: Series of representations indexed like df

    >>> df = pd.DataFrame({"Note": [1.5, 1], "Niveau": [0, 1]})
    >>> compute_latex_rep(df).tolist()
    [1.5, '\\RepU']
    """
    rows = df[["Note", "Niveau"]]
    reps = rows.apply(note_to_rep, axis="columns")
    return reps.fillna("??")
def compute_normalized(df):
    """ Compute the normalized mark (Mark / Bareme)

    :param df: DataFrame with "Mark" and "Bareme" columns
    :returns: Series with the ratio of both columns, row by row

    >>> df = pd.DataFrame({"Mark": [1.0, 1.5], "Bareme": [2, 2]})
    >>> compute_normalized(df).tolist()
    [0.5, 0.75]
    """
    marks = df["Mark"]
    baremes = df["Bareme"]
    return marks / baremes
def compute_mark_barem(df):
    """ Build the "mark / bareme" string of every row of df

    :param df: DataFrame with the columns read by mark_bareme_formater
    :returns: Series of strings indexed like df
    """
    formatted = df.apply(mark_bareme_formater, axis="columns")
    return formatted
def compute_question_description(df):
    """ Compute the description of the question of every row of df

    :param df: DataFrame with the columns read by question_uniq_formater
    :returns: Series of descriptions indexed like df
    """
    descriptions = df.apply(question_uniq_formater, axis="columns")
    return descriptions
# Computing custom values
def compute_exo_marks(df):
    """ Compute Exercice level marks

    "Bareme" and "Mark" are summed for each (Eleve, Nom, Exercice, Date,
    Trimestre) group, then every total goes through round_half_point.
    Each resulting row is tagged with "Total" in "Question" and 0 in
    "Niveau".

    :param df: the original marks, with "Eleve", "Nom", "Exercice", "Date",
        "Trimestre", "Bareme" and "Mark" columns
    :returns: DataFrame with one row per group holding the computed totals
    """
    exo_pt = pd.pivot_table(
        df,
        index=["Eleve", "Nom", "Exercice", "Date", "Trimestre"],
        values=["Bareme", "Mark"],
        # String alias instead of np.sum: passing the numpy callable is
        # flagged by recent pandas versions.
        aggfunc="sum",
    )
    # Element-wise rounding, column by column. Series.map is used because
    # DataFrame.applymap is deprecated in recent pandas versions.
    rounded = exo_pt.apply(lambda col: col.map(round_half_point))

    exo = rounded.reset_index()
    exo["Question"] = "Total"
    exo["Niveau"] = 0
    return exo
def compute_eval_marks(df):
    """ Compute Nom level marks from the dataframe using only row with Total in Question

    For each value of "Nom", "Bareme" and "Mark" are summed for every
    (Eleve, Nom, Trimestre) group. "Date" holds the date of the evaluation
    when it is unique, "Trimestre" otherwise.

    :param df: DataFrame with value Total in Question column
    :returns: DataFrame with evaluation marks; the "index" column holds the
        position of each row inside the table of its own evaluation
    """
    eval_frames = []
    for eval_name in df["Nom"].unique():
        logger.debug(f"Compute marks for {eval_name}")
        eval_df = df[df["Nom"] == eval_name]
        dates = eval_df["Date"].unique()
        logger.debug(f"Find those dates: {dates}")

        if len(dates) > 1 or dates[0] == "Trimestre":
            # Evaluations spread over time: NaN marks must not penalise
            # the student, so the rows without mark are dropped.
            eval_df = eval_df.dropna(subset=["Mark"])
            date_label = "Trimestre"
        else:
            date_label = dates[0]

        eval_pt = pd.pivot_table(
            eval_df,
            index=["Eleve", "Nom", "Trimestre"],
            values=["Bareme", "Mark", "Date"],
            aggfunc={
                "Bareme": "sum",
                "Mark": "sum",
                # Every group of this evaluation gets the same date label
                "Date": lambda _dates: date_label,
            },
        )
        eval_frames.append(eval_pt.reset_index())

    if not eval_frames:
        # pd.concat refuses an empty list: keep returning an empty frame
        return pd.DataFrame().reset_index()
    # Single concatenation instead of growing a DataFrame inside the loop
    return pd.concat(eval_frames).reset_index()
def digest_flat_df(flat_df):
    r""" Compute necessary element to make a flat df usable for analysis.

    Rows without "Note" are dropped, then three tables are built:

    - the question table: the remaining rows with the added "Mark",
      "Level", "Latex_rep" and "Normalized" columns;
    - the exercice table: compute_exo_marks of the question table with the
      added "Normalized" and "Mark_barem" columns;
    - the evaluation table: compute_eval_marks of the exercice table with
      the added "Normalized" and "Mark_barem" columns.

    flat_df itself is not modified.

    :param flat_df: DataFrame with one row per student and question
    :returns: tuple (question table, exercice table, evaluation table)
    """
    # Explicit copy: columns are added below, which must neither touch
    # flat_df nor trigger pandas' SettingWithCopyWarning.
    df = flat_df.dropna(subset=["Note"]).copy()

    df["Mark"] = compute_marks(df)
    df["Level"] = compute_level(df)
    df["Latex_rep"] = compute_latex_rep(df)
    df["Normalized"] = compute_normalized(df)
    # NOTE(review): the "Uniq_quest" column is currently disabled - confirm
    # whether callers still need it before removing or restoring this line.
    #df["Uniq_quest"] = compute_question_description(df)

    exo_df = compute_exo_marks(df)
    exo_df["Normalized"] = compute_normalized(exo_df)
    exo_df["Mark_barem"] = compute_mark_barem(exo_df)

    eval_df = compute_eval_marks(exo_df)
    eval_df["Normalized"] = compute_normalized(eval_df)
    eval_df["Mark_barem"] = compute_mark_barem(eval_df)

    return df, exo_df, eval_df
# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del