Coverage for src/papierstat/datasets/biased.py: 100%

18 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-30 06:49 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Jeux de données biaisés. 

5""" 

6import numpy 

7from scipy.special import expit # pylint: disable=E0611 

8import pandas 

9from sklearn.model_selection import train_test_split 

10 

11 

12__all__ = ['load_biased'] 

13 

14 

def load_biased(N=250):
    """
    Build a synthetic biased dataset and split it into train and test parts.

    Every observation holds a random ``age`` (integer in ``[18, 65)``),
    a random ``gender`` (0 or 1), a number of ``kids`` (0 to 4, drawn from
    a fixed cumulative distribution) and a revenue ``R`` derived from
    these features with uniform noise and floored at 1000.

    :param N: number of observations
    :return: output of ``train_test_split`` applied to a
        :epkg:`pandas:DataFrame` with columns ``age``, ``gender``,
        ``kids``, ``R`` (train part first, then test part)
    """
    # Cumulative probabilities of having 0, 1, 2, 3 or 4 kids.
    kids_cdf = numpy.array([0.6, 0.8, 0.9, 0.95, 1.])

    rows = []
    for _ in range(N):
        # The order of the random draws is kept fixed so that a seeded
        # generator keeps producing the same observations.
        age = numpy.random.randint(18, 65)
        gender = numpy.random.randint(0, 2)
        n_kids = numpy.searchsorted(kids_cdf, numpy.random.random())

        # Revenue grows with age through a sigmoid scaled by uniform noise.
        age_part = expit(age / 30.) * (1000 + 1000 * numpy.random.random())
        revenue = 1000 + age_part
        # Each kid lowers the revenue by a random amount in [400, 500).
        revenue -= n_kids * (400 + 100 * numpy.random.random())
        # NOTE(review): randint(0, 1) excludes its upper bound and therefore
        # always returns 0, so gender never changes the revenue here.
        # Confirm the intended penalty before relying on a gender bias.
        revenue -= gender * numpy.random.randint(0, 1)

        rows.append({
            'age': age,
            'gender': gender,
            'kids': n_kids,
            'R': max(revenue, 1000),  # revenue is never below 1000
        })
    return train_test_split(pandas.DataFrame(rows))