Feat: csv extraction and flattening

This commit is contained in:
Bertrand Benjamin 2019-08-04 21:57:27 +02:00
parent 9358c10b47
commit 2296615cb4
3 changed files with 247 additions and 24 deletions

View File

@ -0,0 +1,15 @@
#!/usr/bin/env python
# encoding: utf-8
# Metadata columns of a score sheet. The extraction helpers treat every
# column that is NOT listed here as a student column.
NO_STUDENT_COLUMNS = [
    "Trimestre", "Nom", "Date",
    "Exercice", "Question",
    "Competence", "Domaine", "Commentaire",
    "Bareme", "Niveau",
]

View File

@ -0,0 +1,75 @@
#!/usr/bin/env python
# encoding: utf-8
""" Extracting data from xlsx files """
import pandas as pd
from . import NO_STUDENT_COLUMNS
# Display floats with 2 decimals. The fully qualified key is used on purpose:
# the bare pattern "Precision" is matched against every option name and is
# ambiguous on pandas versions that also define "styler.format.precision",
# which makes set_option raise OptionError when this module is imported.
pd.set_option("display.precision", 2)
def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
    """Return the student columns of a score sheet.

    :param df: the dataframe
    :param no_student_columns: columns that are not students
    :return: labels of the columns of df that are not in no_student_columns
    """
    # Index.difference keeps every column label absent from
    # no_student_columns (pandas sorts the result when it can).
    return df.columns.difference(no_student_columns)
def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
    """Unpivot the student columns of a score sheet.

    The input has the NO_STUDENT_COLUMNS columns plus one column per
    student. Each student column is melted into a pair of columns,
    "student" (the former column name) and "score" (the cell value).

    :param df: the dataframe (one row per question)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per question and student
    """
    # One melted frame per student, stacked in the order returned by
    # extract_students.
    per_student = [
        pd.melt(
            df,
            id_vars=no_student_columns,
            value_vars=student,
            var_name="student",
            value_name="score",
        )
        for student in extract_students(df, no_student_columns)
    ]
    return pd.concat(per_student)
def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
    """Flatten and clean the dataframe extracted from a csv file.

    The student columns are unpivoted with flat_df_students, every column
    name is lower-cased, and missing values in the text columns
    ("question", "exercice", "commentaire", "competence") are replaced by
    the empty string.

    :param csv_df: dataframe read from csv
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per question and student
    """
    # Forward no_student_columns so that a caller-supplied list is honoured
    # instead of silently falling back to the module default.
    df = flat_df_students(csv_df, no_student_columns)
    df.columns = [column.lower() for column in df.columns]

    # Assign the filled column back explicitly: fillna(inplace=True) on a
    # selected column is a chained update that pandas warns about and that
    # does nothing once Copy-on-Write is enabled.
    for column in ("question", "exercice", "commentaire", "competence"):
        # A custom no_student_columns may not contain every text column.
        if column in df.columns:
            df[column] = df[column].fillna("")
    return df
# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -10,12 +10,13 @@
"from IPython.display import DisplayHandle\n", "from IPython.display import DisplayHandle\n",
"import pandas as pd\n", "import pandas as pd\n",
"from pathlib import Path\n", "from pathlib import Path\n",
"from datetime import datetime" "from datetime import datetime\n",
"from recopytex.csv_extraction import flat_clear_csv"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 2,
"metadata": { "metadata": {
"tags": [ "tags": [
"parameters" "parameters"
@ -24,35 +25,20 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"tribe = \"308\"\n", "tribe = \"308\"\n",
"assessment = \"161114_dm2\"\n", "assessment = \"DM1\"\n",
"csv_file = Path(f\"./sheets/{tribe}/{assessment}.csv\")" "date = \"15/09/16\"\n",
"csv_file = Path(f\"../sheets/{tribe}/160915_{assessment}.csv\")"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"split_ass = assessment.split(\"_\")\n",
"if len(split_ass) > 1:\n",
" date, *assessment = assessment.split(\"_\")\n",
" date = datetime.strptime(date, \"%y%m%d\")\n",
" assessment = ' '.join(assessment)\n",
"else:\n",
" date = None\n",
" assessment = split_ass[0]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/markdown": [ "text/markdown": [
"# dm2 (14/11/2016) pour 308" "# DM1 (15/09/16) pour 308"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.Markdown object>" "<IPython.core.display.Markdown object>"
@ -66,7 +52,154 @@
"if date is None:\n", "if date is None:\n",
" display(md(f\"# {assessment} pour {tribe}\"))\n", " display(md(f\"# {assessment} pour {tribe}\"))\n",
"else:\n", "else:\n",
" display(md(f\"# {assessment} ({date:%d/%m/%Y}) pour {tribe}\"))" " display(md(f\"# {assessment} ({date}) pour {tribe}\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>trimestre</th>\n",
" <th>nom</th>\n",
" <th>date</th>\n",
" <th>exercice</th>\n",
" <th>question</th>\n",
" <th>competence</th>\n",
" <th>domaine</th>\n",
" <th>commentaire</th>\n",
" <th>bareme</th>\n",
" <th>niveau</th>\n",
" <th>student</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>DM1</td>\n",
" <td>15/09/16</td>\n",
" <td>1</td>\n",
" <td>1.1</td>\n",
" <td>Cal</td>\n",
" <td>Prio</td>\n",
" <td></td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>ABDOU Asmahane</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>DM1</td>\n",
" <td>15/09/16</td>\n",
" <td>1</td>\n",
" <td>1.2</td>\n",
" <td>Cal</td>\n",
" <td>Prio</td>\n",
" <td></td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>ABDOU Asmahane</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>DM1</td>\n",
" <td>15/09/16</td>\n",
" <td>1</td>\n",
" <td>1.3</td>\n",
" <td>Cal</td>\n",
" <td>Prio</td>\n",
" <td></td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>ABDOU Asmahane</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>DM1</td>\n",
" <td>15/09/16</td>\n",
" <td>1</td>\n",
" <td>1.4</td>\n",
" <td>Cal</td>\n",
" <td>Prio</td>\n",
" <td></td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>ABDOU Asmahane</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>DM1</td>\n",
" <td>15/09/16</td>\n",
" <td>1</td>\n",
" <td>1.5</td>\n",
" <td>Cal</td>\n",
" <td>Prio</td>\n",
" <td></td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>ABDOU Asmahane</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" trimestre nom date exercice question competence domaine commentaire \\\n",
"0 1 DM1 15/09/16 1 1.1 Cal Prio \n",
"1 1 DM1 15/09/16 1 1.2 Cal Prio \n",
"2 1 DM1 15/09/16 1 1.3 Cal Prio \n",
"3 1 DM1 15/09/16 1 1.4 Cal Prio \n",
"4 1 DM1 15/09/16 1 1.5 Cal Prio \n",
"\n",
" bareme niveau student score \n",
"0 1.0 1 ABDOU Asmahane 2 \n",
"1 1.0 1 ABDOU Asmahane 3 \n",
"2 1.0 1 ABDOU Asmahane 2 \n",
"3 1.0 1 ABDOU Asmahane 2 \n",
"4 1.0 1 ABDOU Asmahane 2 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stack_scores = pd.read_csv(csv_file)\n",
"scores = flat_clear_csv(stack_scores)\n",
"scores.head()"
] ]
}, },
{ {