Coverage for src/ensae_teaching_cs/homeblog/postclassification.py: 100%

13 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-04-28 06:23 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Helpers for blog classification 

5""" 

6 

7 

# Maps every class label (prefixed with "~") to the keywords which trigger it.
# ``classify_post`` lower-cases the keywords of a post before looking them up,
# so only the lower-case entries below can match; the upper-case variants
# ("C#", "ENSAE") are kept untouched for backward compatibility.
privateKeyClassification = {
    "~recreative": [
        "\xE9conomie farfelue", "xaveir", "xavier", "xavier dupr\xE9",
        "litt\xE9rature", "green website", "restaurant", "alimentation",
        "cuisine", "emploi", "discussion", "wifi", "smart cities",
        "t\xE9l\xE9vision", "jeu", "jeux", "cin\xE9ma", "d\xE9couverte",
        "cheminement", "\xE9conomie", "d\xE9mocratie", "d\xE9mographie",
        "m\xE9decine", "th\xE9\xE2tre", "\xE9cole", "papa", "recreative",
        "video", "photo", "joke", "tennis",
    ],
    "~technical": [
        "python", "programming", "c", "p-value", "edit distance", "latex",
        "vba", "javascript", "big data", "math\xE9matique", "programmation",
        "programmer", "internet", "algorithm", "algorithme", "extreme values",
        "C#", "c#", "c sharp", "csharp", "machine learning", "os", "r", "git",
        # historical misspelling, kept so that posts tagged this way
        # are still classified
        "doon\xE9es",
        # correct spelling, which was missing from the table
        "donn\xE9es",
    ],
    "~ENSAE": [
        "ensae alumni", "data scientist", "ensae", "ENSAE", "enseignement",
    ],
}

# Class labels, in the order they are declared above.
privateKeyClassificationMandatory = list(privateKeyClassification.keys())

24 

25 

def classify_post(keywords, content):
    """
    Classifies a blog post based on its keywords.

    Keywords which already are a class label (a key of
    ``privateKeyClassification``) are dropped, the remaining ones are
    converted to lower case and looked up in ``privateKeyClassification``.
    A keyword may belong to several classes, all of them are returned.

    @param      keywords    iterable of strings, keywords attached to the post
    @param      content     content of the post, not used by the
                            current implementation
    @return                 list of strings: the matching class labels
                            (each of them once, in the order they were
                            first matched) followed by the lower-cased
                            keywords
    """
    # existing class labels are removed, they are recomputed below
    clean_keywords = [kw.lower() for kw in keywords
                      if kw not in privateKeyClassification]

    # A dictionary is used as an ordered set: a label matched by several
    # keywords is reported only once and the order of first match is kept.
    matched = {}
    for kw in clean_keywords:
        for label, members in privateKeyClassification.items():
            if kw in members:
                # no break: a keyword may belong to more than one class
                matched[label] = None

    return list(matched) + clean_keywords