Feat: first analysis and plots

This commit is contained in:
Bertrand Benjamin 2019-08-04 23:24:32 +02:00
parent 2296615cb4
commit 358af2aabf
5 changed files with 561 additions and 125 deletions

View File

@ -1,15 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8
NO_STUDENT_COLUMNS = [
"Trimestre",
"Nom",
"Date",
"Exercice",
"Question",
"Competence",
"Domaine",
"Commentaire",
"Bareme",
"Niveau",
]
from .csv_extraction import flat_clear_csv
from .df_marks_manip import pp_q_scores

30
recopytex/config.py Normal file
View File

@ -0,0 +1,30 @@
#!/usr/bin/env python
# encoding: utf-8
NO_ST_COLUMNS = {
"term": "Trimestre",
"assessment": "Nom",
"date": "Date",
"exercise": "Exercice",
"question": "Question",
"competence": "Competence",
"theme": "Domaine",
"comment": "Commentaire",
"score_rate": "Bareme",
"is_leveled": "Est_nivele",
}
COLUMNS = {
**NO_ST_COLUMNS,
"student": "Eleve",
"score": "Score",
"mark": "Note",
"level": "Niveau",
"normalized": "Normalise",
}
VALIDSCORE = {
"NOTFILLED": "", # The item is not scored yet
"NOANSWER": ".", # Student gives no answer (this score will impact the fianl mark)
"ABS": "a", # Student has absent (this score won't be impact the final mark)
}

View File

@ -4,12 +4,12 @@
""" Extracting data from xlsx files """
import pandas as pd
from . import NO_STUDENT_COLUMNS
from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE
pd.set_option("Precision", 2)
def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
""" Extract the list of students from df
:param df: the dataframe
@ -20,7 +20,7 @@ def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
return students
def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):
""" Flat the ws for students
:param df: the dataframe (one row per questions)
@ -29,7 +29,7 @@ def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
Columns of csv files:
- NO_STUDENT_COLUMNS
- NO_ST_COLUMNS meta data on questions
- one for each students
This function flat student's columns to "student" and "score"
@ -42,14 +42,14 @@ def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
df,
id_vars=no_student_columns,
value_vars=st,
var_name="student",
value_name="score",
var_name=COLUMNS["student"],
value_name=COLUMNS["score"],
)
)
return pd.concat(scores)
return pd.concat(scores).dropna(subset=[COLUMNS["score"]])
def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):
""" Flat and clear the dataframe extracted from csv
:param csv_df: data frame read from csv
@ -59,12 +59,16 @@ def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
"""
df = flat_df_students(csv_df)
df.columns = df.columns.map(lambda x: x.lower())
df[COLUMNS["question"]].fillna("", inplace=True)
df[COLUMNS["exercise"]].fillna("", inplace=True)
df[COLUMNS["comment"]].fillna("", inplace=True)
df[COLUMNS["competence"]].fillna("", inplace=True)
df["question"].fillna("", inplace=True)
df["exercice"].fillna("", inplace=True)
df["commentaire"].fillna("", inplace=True)
df["competence"].fillna("", inplace=True)
df[COLUMNS["score"]] = pd.to_numeric(
df[COLUMNS["score"]]
.replace(VALIDSCORE["NOANSWER"], -1)
.apply(lambda x: str(x).replace(",", "."))
)
return df

205
recopytex/df_marks_manip.py Normal file
View File

@ -0,0 +1,205 @@
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
import numpy as np
from math import ceil, floor
from .config import COLUMNS, VALIDSCORE
# Values manipulations
def round_half_point(val):
try:
return 0.5 * ceil(2.0 * val)
except ValueError:
return val
except TypeError:
return val
def score_to_mark(x):
""" Compute the mark
if the item is leveled then the score is multiply by the score_rate
otherwise it copies the score
:param x: dictionnary with COLUMNS["is_leveled"], COLUMNS["score"] and COLUMNS["score_rate"] keys
>>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
... COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
... COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
... COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3, 0.666, 1, 1.5, 1, 2, 3],
... }
>>> df = pd.DataFrame(d)
>>> score_to_mark(df.loc[0])
1.0
>>> score_to_mark(df.loc[10])
1.3333333333333333
"""
# -1 is no answer
if x[COLUMNS["score"]] == -1:
return 0
if x[COLUMNS["is_leveled"]]:
if x[COLUMNS["score"]] not in [0, 1, 2, 3]:
raise ValueError(f"The evaluation is out of range: {x[COLUMNS['score']]} at {x}")
return x[COLUMNS["score"]] * x[COLUMNS["score_rate"]] / 3
if x[COLUMNS["score"]] > x[COLUMNS["score_rate"]]:
raise ValueError(
f"The score ({x['score']}) is greated than the rating scale ({x[COLUMNS['score_rate']]}) at {x}"
)
return x[COLUMNS["score"]]
def score_to_level(x):
""" Compute the level (".",0,1,2,3).
:param x: dictionnary with COLUMNS["is_leveled"], COLUMNS["score"] and COLUMNS["score_rate"] keys
>>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
... COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
... COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
... COLUMNS["score"]:[1, 0.33, np.nan, 1.5, 1, 3, 0.666, 1, 1.5, 1, 2, 3],
... }
>>> df = pd.DataFrame(d)
>>> score_to_level(df.loc[0])
3
>>> score_to_level(df.loc[1])
1
>>> score_to_level(df.loc[2])
'na'
>>> score_to_level(df.loc[3])
3
>>> score_to_level(df.loc[5])
3
>>> score_to_level(df.loc[10])
2
"""
# -1 is no answer
if x[COLUMNS["score"]] == -1:
return x[COLUMNS["score"]]
if x[COLUMNS["is_leveled"]]:
return int(x[COLUMNS["score"]])
else:
return int(ceil(x[COLUMNS["score"]] / x[COLUMNS["score_rate"]] * 3))
# DataFrame columns manipulations
def compute_mark(df):
""" Add Mark column to df
:param df: DataFrame with COLUMNS["score"], COLUMNS["is_leveled"] and COLUMNS["score_rate"] columns.
>>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
... COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
... COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
... COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3, 0.666, 1, 1.5, 1, 2, 3],
... }
>>> df = pd.DataFrame(d)
>>> compute_mark(df)
0 1.00
1 0.33
2 2.00
3 1.50
4 0.67
5 2.00
6 0.67
7 1.00
8 1.50
9 1.00
10 1.33
11 2.00
dtype: float64
"""
return df[[COLUMNS["score"], COLUMNS["is_leveled"], COLUMNS["score_rate"]]].apply(
score_to_mark, axis=1
)
def compute_level(df):
""" Add Mark column to df
:param df: DataFrame with COLUMNS["score"], COLUMNS["is_leveled"] and COLUMNS["score_rate"] columns.
>>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
... COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
... COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
... COLUMNS["score"]:[np.nan, 0.33, 2, 1.5, 1, 3, 0.666, 1, 1.5, 1, 2, 3],
... }
>>> df = pd.DataFrame(d)
>>> compute_level(df)
0 na
1 1
2 3
3 3
4 1
5 3
6 2
7 3
8 3
9 2
10 2
11 3
dtype: object
"""
return df[[COLUMNS["score"], COLUMNS["is_leveled"], COLUMNS["score_rate"]]].apply(
score_to_level, axis=1
)
def compute_normalized(df):
""" Compute the normalized mark (Mark / score_rate)
:param df: DataFrame with "Mark" and COLUMNS["score_rate"] columns
>>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
... COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
... COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
... COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3, 0.666, 1, 1.5, 1, 2, 3],
... }
>>> df = pd.DataFrame(d)
>>> df["Mark"] = compute_marks(df)
>>> compute_normalized(df)
0 1.00
1 0.33
2 1.00
3 0.75
4 0.33
5 1.00
6 0.67
7 1.00
8 0.75
9 0.50
10 0.67
11 1.00
dtype: float64
"""
return df[COLUMNS["mark"]] / df[COLUMNS["score_rate"]]
# Postprocessing question scores
def pp_q_scores(df):
""" Postprocessing questions scores dataframe
:param df: questions-scores dataframe
:return: same data frame with mark, level and normalize columns
"""
assign = {
COLUMNS["mark"]: compute_mark,
COLUMNS["level"]: compute_level,
COLUMNS["normalized"]: compute_normalized,
}
return df.assign(**assign)
# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del

File diff suppressed because one or more lines are too long