2019-08-04 19:57:27 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# encoding: utf-8
|
|
|
|
|
|
|
|
""" Extracting data from xlsx files """
|
|
|
|
|
|
|
|
import pandas as pd
|
2019-08-04 21:24:32 +00:00
|
|
|
from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE
|
2019-08-04 19:57:27 +00:00
|
|
|
|
|
|
|
pd.set_option("Precision", 2)
|
|
|
|
|
2019-08-20 19:15:39 +00:00
|
|
|
|
2019-08-06 05:02:07 +00:00
|
|
|
def try_replace(x, old, new):
|
|
|
|
try:
|
|
|
|
return str(x).replace(old, new)
|
|
|
|
except ValueError:
|
|
|
|
return x
|
|
|
|
|
2019-08-04 19:57:27 +00:00
|
|
|
|
2019-08-04 21:24:32 +00:00
|
|
|
def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
|
2021-01-05 08:14:52 +00:00
|
|
|
"""Extract the list of students from df
|
2019-08-04 19:57:27 +00:00
|
|
|
|
|
|
|
:param df: the dataframe
|
|
|
|
:param no_student_columns: columns that are not students
|
|
|
|
:return: list of students
|
|
|
|
"""
|
|
|
|
students = df.columns.difference(no_student_columns)
|
|
|
|
return students
|
|
|
|
|
|
|
|
|
2019-08-20 19:15:39 +00:00
|
|
|
def flat_df_students(
|
|
|
|
df, no_student_columns=NO_ST_COLUMNS.values(), postprocessing=True
|
|
|
|
):
|
2021-01-05 08:14:52 +00:00
|
|
|
"""Flat the dataframe by returning a dataframe with on student on each line
|
2019-08-04 19:57:27 +00:00
|
|
|
|
|
|
|
:param df: the dataframe (one row per questions)
|
|
|
|
:param no_student_columns: columns that are not students
|
|
|
|
:return: dataframe with one row per questions and students
|
|
|
|
|
|
|
|
Columns of csv files:
|
|
|
|
|
2019-08-04 21:24:32 +00:00
|
|
|
- NO_ST_COLUMNS meta data on questions
|
2019-08-04 19:57:27 +00:00
|
|
|
- one for each students
|
|
|
|
|
|
|
|
This function flat student's columns to "student" and "score"
|
|
|
|
"""
|
|
|
|
students = extract_students(df, no_student_columns)
|
|
|
|
scores = []
|
|
|
|
for st in students:
|
|
|
|
scores.append(
|
|
|
|
pd.melt(
|
|
|
|
df,
|
|
|
|
id_vars=no_student_columns,
|
|
|
|
value_vars=st,
|
2019-08-04 21:24:32 +00:00
|
|
|
var_name=COLUMNS["student"],
|
|
|
|
value_name=COLUMNS["score"],
|
2019-08-06 05:02:07 +00:00
|
|
|
).dropna(subset=[COLUMNS["score"]])
|
2019-08-04 19:57:27 +00:00
|
|
|
)
|
2019-08-20 19:15:39 +00:00
|
|
|
if postprocessing:
|
|
|
|
return postprocess(pd.concat(scores))
|
2019-08-06 05:02:07 +00:00
|
|
|
return pd.concat(scores)
|
2019-08-04 19:57:27 +00:00
|
|
|
|
|
|
|
|
2019-08-20 19:15:39 +00:00
|
|
|
def flat_df_for(
|
|
|
|
df, student, no_student_columns=NO_ST_COLUMNS.values(), postprocessing=True
|
|
|
|
):
|
2021-01-05 08:14:52 +00:00
|
|
|
"""Extract the data only for one student
|
2019-08-04 19:57:27 +00:00
|
|
|
|
2019-08-20 19:15:39 +00:00
|
|
|
:param df: the dataframe (one row per questions)
|
2019-08-04 19:57:27 +00:00
|
|
|
:param no_student_columns: columns that are not students
|
|
|
|
:return: dataframe with one row per questions and students
|
|
|
|
|
2019-08-20 19:15:39 +00:00
|
|
|
Columns of csv files:
|
|
|
|
|
|
|
|
- NO_ST_COLUMNS meta data on questions
|
|
|
|
- one for each students
|
|
|
|
|
|
|
|
"""
|
|
|
|
students = extract_students(df, no_student_columns)
|
|
|
|
if student not in students:
|
|
|
|
raise KeyError("This student is not in the table")
|
|
|
|
st_df = df[list(no_student_columns) + [student]]
|
|
|
|
st_df = st_df.rename(columns={student: COLUMNS["score"]}).dropna(
|
|
|
|
subset=[COLUMNS["score"]]
|
|
|
|
)
|
|
|
|
if postprocessing:
|
|
|
|
return postprocess(st_df)
|
|
|
|
return st_df
|
|
|
|
|
|
|
|
|
|
|
|
def postprocess(df):
|
2021-01-05 08:14:52 +00:00
|
|
|
"""Postprocessing score dataframe
|
2019-08-20 19:15:39 +00:00
|
|
|
|
|
|
|
- Replace na with an empty string
|
|
|
|
- Replace "NOANSWER" with -1
|
|
|
|
- Turn commas number to dot numbers
|
2019-08-04 19:57:27 +00:00
|
|
|
"""
|
|
|
|
|
2019-08-04 21:24:32 +00:00
|
|
|
df[COLUMNS["question"]].fillna("", inplace=True)
|
|
|
|
df[COLUMNS["exercise"]].fillna("", inplace=True)
|
|
|
|
df[COLUMNS["comment"]].fillna("", inplace=True)
|
|
|
|
df[COLUMNS["competence"]].fillna("", inplace=True)
|
2019-08-04 19:57:27 +00:00
|
|
|
|
2019-08-04 21:24:32 +00:00
|
|
|
df[COLUMNS["score"]] = pd.to_numeric(
|
|
|
|
df[COLUMNS["score"]]
|
|
|
|
.replace(VALIDSCORE["NOANSWER"], -1)
|
2019-08-06 05:02:07 +00:00
|
|
|
.apply(lambda x: try_replace(x, ",", "."))
|
2019-08-04 21:24:32 +00:00
|
|
|
)
|
2019-08-04 21:32:22 +00:00
|
|
|
df[COLUMNS["score_rate"]] = pd.to_numeric(
|
2019-08-20 19:15:39 +00:00
|
|
|
df[COLUMNS["score_rate"]].apply(lambda x: try_replace(x, ",", ".")),
|
|
|
|
errors="coerce",
|
2019-08-04 21:32:22 +00:00
|
|
|
)
|
2019-08-04 19:57:27 +00:00
|
|
|
|
|
|
|
return df
|
|
|
|
|
|
|
|
|
|
|
|
# -----------------------------
|
|
|
|
# Reglages pour 'vim'
|
|
|
|
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
|
|
|
|
# cursor: 16 del
|