recopytex/recopytex/csv_extraction.py

91 lines
2.5 KiB
Python
Raw Normal View History

2019-08-04 19:57:27 +00:00
#!/usr/bin/env python
# encoding: utf-8
""" Extracting data from xlsx files """
import pandas as pd
2019-08-04 21:24:32 +00:00
from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE
2019-08-04 19:57:27 +00:00
# Display floats with 2 decimals. The fully-qualified key is used because a bare
# "precision" pattern is ambiguous on pandas versions that also define
# "styler.format.precision", which makes set_option raise OptionError.
pd.set_option("display.precision", 2)
2019-08-06 05:02:07 +00:00
def try_replace(x, old, new):
    """ Replace `old` by `new` in the string form of x

    :param x: any value; it is converted with str() before the replacement
    :param old: substring to look for
    :param new: substring put in place of `old`
    :return: str(x) with the replacement applied, or x itself if a
        ValueError is raised along the way
    """
    try:
        replaced = str(x).replace(old, new)
    except ValueError:
        # Fall back to the untouched value
        return x
    else:
        return replaced
2019-08-04 19:57:27 +00:00
2019-08-04 21:24:32 +00:00
def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Find the student columns of df

    Every column of df that is not listed in no_student_columns is
    considered to be a student.

    :param df: the dataframe
    :param no_student_columns: columns that are not students
    :return: the column labels of df left once no_student_columns are removed
    """
    all_columns = df.columns
    return all_columns.difference(no_student_columns)
2019-08-04 21:24:32 +00:00
def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flatten the student columns of df

    The input has the NO_ST_COLUMNS metadata columns describing each
    question plus one column per student. Each student column is melted
    into a pair of columns (COLUMNS["student"], COLUMNS["score"]) and rows
    without a score are discarded.

    :param df: the dataframe (one row per question)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per question and student
    """

    def melt_one(student):
        # Long format for a single student: metadata columns are kept as
        # identifiers, the student's column becomes (student, score).
        melted = pd.melt(
            df,
            id_vars=no_student_columns,
            value_vars=student,
            var_name=COLUMNS["student"],
            value_name=COLUMNS["score"],
        )
        # Questions the student has no score for are dropped
        return melted.dropna(subset=[COLUMNS["score"]])

    per_student = [
        melt_one(student) for student in extract_students(df, no_student_columns)
    ]
    return pd.concat(per_student)
2019-08-04 19:57:27 +00:00
2019-08-04 21:24:32 +00:00
def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flatten and clean the dataframe extracted from csv

    :param csv_df: dataframe read from csv (one row per question)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per question and student, where the
        question, exercise, comment and competence columns have no missing
        values and the score and score_rate columns are numeric
    """
    # Forward the caller's choice of non-student columns; it used to be
    # dropped here, so the default was always applied.
    df = flat_df_students(csv_df, no_student_columns)

    # Missing text metadata becomes an empty string. The result is assigned
    # back explicitly: an in-place fillna on a column selection is chained
    # assignment and does not reliably update df on recent pandas.
    for key in ("question", "exercise", "comment", "competence"):
        column = COLUMNS[key]
        df[column] = df[column].fillna("")

    # The "no answer" marker is mapped to -1 and decimal commas are turned
    # into dots before the numeric conversion.
    df[COLUMNS["score"]] = pd.to_numeric(
        df[COLUMNS["score"]]
        .replace(VALIDSCORE["NOANSWER"], -1)
        .apply(lambda x: try_replace(x, ",", "."))
    )
    # Values of score_rate that cannot be parsed are coerced to NaN
    df[COLUMNS["score_rate"]] = pd.to_numeric(
        df[COLUMNS["score_rate"]].apply(lambda x: try_replace(x, ",", ".")),
        errors="coerce",
    )
    return df
# -----------------------------
# Settings for 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del