Coverage for src/ensae_teaching_cs/data/datacpt.py: 93%

29 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-04-28 06:23 +0200

1""" 

2@file 

3@brief Data for competitions 

4""" 

5import os 

6import random 

7import pandas 

8from pyensae.datasource import download_data 

9from pyquickhelper.loghelper import noLOG 

10from pyquickhelper.filehelper.encryption import decrypt_stream 

11 

12 

def data_cpt_ENSAE_2016_11(folder=".", fLOG=noLOG):
    """
    Retrieves the datasets of the competition
    :epkg:`Python 2A ENSAE 2016`,
    stored on github as `ensae_competition_2016.zip
    <https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/
    2016_ENSAE_2A/ensae_competition_2016.zip>`_.

    @param      folder      where to download and unzip
    @param      fLOG        logging function
    @return                 2 dataframes, the first one with X, Y,
                            the second one with only X

    Both files are read as tab-separated text with a two-row header
    and the first column used as the index. An ``IndexError`` is raised
    if one of the expected files is missing from the downloaded files.
    """
    archive = "ensae_competition_2016.zip"
    location = "https://github.com/sdpython/ensae_teaching_cs/raw/master/_doc/competitions/2016_ENSAE_2A/"
    extracted = download_data(archive, url=location, whereTo=folder, fLOG=fLOG)

    # Both files share the same layout, hence the same reading options.
    csv_options = dict(header=[0, 1], sep="\t", index_col=0)

    # Training set: the file whose name ends with the expected train file name.
    train_matches = [
        path for path in extracted
        if path.endswith("ensae_competition_train.txt")
    ]
    train = pandas.read_csv(train_matches[0], **csv_options)

    # Test set: the first file whose name contains "test_X".
    test_matches = [path for path in extracted if "test_X" in path]
    test = pandas.read_csv(test_matches[0], **csv_options)

    return train, test

33 

34 

def data_cpt_ENSAE_2016_11_blind_set(password):
    """
    Returns the evaluation set for the competition
    :epkg:`Python 2A ENSAE 2016`.

    @param      password    password used to decrypt the answers,
                            a string (converted into bytes with the
                            ``ascii`` encoding) or bytes; the special value
                            ``"dummy"`` skips the decryption and returns
                            random values instead
    @return                 a list of 7500 random floats if *password*
                            is ``"dummy"``, otherwise the list of integers
                            read from the decrypted file (one per non-empty line)

    The function raises ``ValueError`` if *password* is None and
    ``FileNotFoundError`` if the encrypted file ``data_competition/answers.bin``
    is not found next to this module.

    The competition is over. The password is ``xdameratxdamerat``.
    """
    if password == "dummy":
        return [random.random() for _ in range(7500)]

    # Validate the argument before touching the file system so that an
    # invalid call always fails with the same, relevant error.
    if password is None:
        raise ValueError("password cannot be None.")

    name = os.path.join(os.path.dirname(__file__),
                        "data_competition", "answers.bin")
    if not os.path.exists(name):
        raise FileNotFoundError(name)
    with open(name, "rb") as f:
        encrypted = f.read()

    # decrypt_stream receives the password as bytes.
    if not isinstance(password, bytes):
        password = bytes(password, "ascii")
    decrypted = decrypt_stream(password, encrypted)

    # Drop carriage returns so that Windows line endings are handled
    # the same way as Unix ones, then skip empty lines.
    text = decrypted.decode("ascii").replace("\r", "")
    return [int(line) for line in text.split("\n") if line]