Mapytex/mapytex/stat/weightedDataset.py

195 lines
5.1 KiB
Python
Raw Normal View History

2019-10-13 19:01:31 +00:00
#!/usr/bin/env python
2016-01-08 13:58:21 +00:00
# -*- coding:Utf-8 -*-
"""
Statistical tools that ease the creation of statistics exercises.
"""
2016-01-08 13:58:21 +00:00
2016-01-09 09:34:46 +00:00
from math import sqrt, ceil
from collections import Counter
from .dataset import Dataset
from itertools import chain
2016-01-09 15:40:02 +00:00
from .number_tools import number_factory
2016-01-09 09:34:46 +00:00
2019-10-13 19:01:31 +00:00
def flatten_list(l):
    """Flatten one level of nesting.

    :param l: an iterable of iterables
    :return: a new list holding the items of every inner iterable, in order

    >>> flatten_list([[1, 2], [3], []])
    [1, 2, 3]
    """
    # chain.from_iterable walks the outer iterable directly instead of
    # unpacking it into positional arguments first.
    return list(chain.from_iterable(l))
2016-01-09 09:34:46 +00:00
2019-10-13 19:01:31 +00:00
2016-01-09 09:34:46 +00:00
class WeightedDataset(dict):
    """A weighted dataset with statistics and LaTeX rendering methods.

    The instance is a dict mapping each data value to its weight.

    >>> w = WeightedDataset([1, 2, 3, 4], "Enfants", [10, 11, 12, 13])
    >>> print(w)
    {1: 10, 2: 11, 3: 12, 4: 13}
    >>> w.effectif_total()
    46
    >>> w.sum()
    120
    >>> w.mean()
    2.61
    >>> w.deviation()
    56.96
    >>> w.variance()
    1.24
    >>> w.sd()
    1.11

    """

    def __init__(
        self, datas=None, data_name="Valeurs", weights=None, weight_name="Effectifs"
    ):
        """Initiate the WeightedDataset.

        :param datas: the data values; when no weights are given, each
            value is weighted by its number of occurrences in ``datas``
        :param data_name: label of the data row (used by tabular_latex)
        :param weights: weights paired positionally with ``datas``; if a
            value appears several times in ``datas``, its last weight wins
        :param weight_name: label of the weight row (used by tabular_latex)
        :raises ValueError: if weights are given and their number differs
            from the number of datas
        """
        # None stands for "nothing given"; avoids mutable default arguments.
        datas = [] if datas is None else datas
        weights = [] if weights is None else weights

        if weights:
            if len(datas) != len(weights):
                raise ValueError("Datas and weights should have same length")
            weighted_datas = dict(zip(datas, weights))
        else:
            # Also covers empty ``datas``: the result is an empty dataset
            # rather than an unbound local variable.
            weighted_datas = Counter(datas)

        dict.__init__(self, weighted_datas)
        self.data_name = data_name
        self.weight_name = weight_name

    def add_data(self, data, weight=1):
        """Add ``weight`` to the weight of ``data``, creating it if needed.

        :param data: the data value to add
        :param weight: the weight to add for this value (default 1)
        """
        try:
            self[data] += weight
        except KeyError:
            self[data] = weight

    @number_factory
    def total_weight(self):
        """Return the sum of all the weights."""
        return sum(self.values())

    def effectif_total(self):
        """Return the total weight (alias of total_weight)."""
        return self.total_weight()

    @number_factory
    def sum(self):
        """Not really a sum but the sum of the products value * weight."""
        return sum(k * v for (k, v) in self.items())

    @number_factory
    def mean(self):
        """Compute the weighted mean: sum() divided by effectif_total()."""
        return self.sum() / self.effectif_total()

    @number_factory
    def deviation(self):
        """Compute the deviation (not normalized).

        This is the weighted sum of the squared differences between each
        value and the result of mean().
        """
        mean = self.mean()
        return sum(v * (k - mean) ** 2 for (k, v) in self.items())

    @number_factory
    def variance(self):
        """Compute the variance: deviation() divided by effectif_total()."""
        return self.deviation() / self.effectif_total()

    @number_factory
    def sd(self):
        """Compute the standard deviation (square root of variance())."""
        return sqrt(self.variance())

    def quartiles(self):
        """Compute the five-number summary of the series.

        :return: a tuple (min, Q1, Me, Q3, max)

        >>> w = WeightedDataset(flatten_list([i*[i] for i in range(5)]))
        >>> w.quartiles()
        (1, 2, 3, 4, 4)
        >>> w = WeightedDataset(flatten_list([i*[i] for i in range(6)]))
        >>> w.quartiles()
        (1, 3, 4, 5, 5)
        """
        return (
            min(self),
            self.quartile(1),
            self.quartile(2),
            self.quartile(3),
            max(self),
        )

    @number_factory
    def quartile(self, quartile=1):
        """Compute one quartile of the series.

        :param quartile: the quartile to compute (default 1 -> Q1)
        :return: the requested quartile

        :Example:

        >>> w = WeightedDataset(flatten_list([i*[i] for i in range(5)]))
        >>> w.quartile(1)
        2
        >>> w.quartile(2)
        3
        >>> w.quartile(3)
        4
        >>> w = WeightedDataset(flatten_list([i*[i] for i in range(6)]))
        >>> w.quartile(1)
        3
        >>> w.quartile(2)
        4
        >>> w.quartile(3)
        5
        """
        # -1 to match with list indexing
        position = self.posi_quartile(quartile) - 1
        # Expand every value ``weight`` times, in ascending order of value:
        # the dict keeps insertion order, which is not necessarily sorted.
        # NOTE(review): this expansion needs integer weights.
        expanded_values = flatten_list([self[k] * [k] for k in sorted(self)])
        if position.is_integer():
            # The position falls exactly on an element: average it with
            # the next one.
            index = int(position)
            return (expanded_values[index] + expanded_values[index + 1]) / 2
        return expanded_values[ceil(position)]

    def posi_quartile(self, quartile=1):
        """Compute the (1-based) position of a quartile in the series.

        :param quartile: the quartile concerned
        :return: the position of the quartile, not rounded
        """
        return quartile * self.effectif_total() / 4

    # --------------------------
    # LaTeX rendering

    def tabular_latex(self):
        """Latex code to display dataset as a tabular.

        :return: a string holding a tabular environment with one row for
            the values and one row for the weights, both in the dict's
            iteration order
        """
        latex = "\\begin{{tabular}}{{|c|*{{{nbr_col}}}{{c|}}}} \n".format(
            nbr_col=len(self)
        )
        # Backslash escaped explicitly: "\h" is not a valid escape sequence.
        latex += "\t \\hline \n"
        data_line = "\t {data_name} ".format(data_name=self.data_name)
        weight_line = "\t {weight_name} ".format(weight_name=self.weight_name)

        # TODO: find a solution for formatting the data values
        # |sam. janv. 9 13:14:26 EAT 2016
        for (v, e) in self.items():
            data_line += "& {val} ".format(val=v)
            weight_line += "& {eff} ".format(eff=e)

        latex += data_line + "\\\\ \n \t \\hline \n"
        latex += weight_line + "\\\\ \n \t \\hline \n"
        latex += "\\end{tabular}"

        return latex
# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
2016-02-13 03:29:26 +00:00
# cursor: 16 del