Coverage for src/ensae_teaching_cs/homeblog/program_helper.py: 96%

55 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-04-28 06:23 +0200

1""" 

2@file 

3@brief Various functions about programs, such as guessing the language of a piece of code

4""" 

5import re 

6 

7 

def guess_language_code(code):
    """
    Guess the language of a piece of code.
    The result can be: js, xml, cpp, py, sql, vb, css

    @param      code        code
    @return                 tuple *(language, score)* where score is in [0,1]
                            (1 is good), or None if no language clearly wins

    The algorithm compares languages two by two on the keywords
    (and comment markers) they do not have in common. A language is
    returned only if it is the single one which wins (or ties) every duel.
    """
    # Normalise whitespace: non-breaking spaces, line breaks and tabs
    # all become plain spaces so the token regexes see a single line.
    code = code.replace("\xa0", " ").replace(
        "\r", "").replace("\n", " ").replace("\t", " ")
    stripcode = code.strip()
    if stripcode.startswith(("<html>", "<xml", "<!DOCTYPE html>")):
        return ('xml', 1.0)

    # exp1 extracts lowercase words, exp2 extracts opening/closing tags.
    # The inner group of exp2 is non-capturing: with two capturing groups
    # findall returns tuples which can never match the string keywords.
    exp1 = re.compile("[^a-z]([a-z]{2,8})[^a-z0-9]")
    exp2 = re.compile("(</?[a-z]{2,8}(?: |>))")
    keywords = {"py": set(("format with len from numpy enumerate as and or ord range try except " +
                           "raise for while if else elif with self assert " +
                           "for in if not import del from map random sys append except in range elif " +
                           "float str def raise except none").split()),
                "sql": set("on outer full as count and or desc asc from select group by order where join inner".split()),
                "xml": set("<body> <xml> </body> <script> <script </script> <head> </head> <meta> <meta </meta>".split()),
                "css": set("border font background size".split()),
                "vb": set("error for sub function while wend then to end next dim set".split()),
                "cpp": set(("ord try catch throw try for while if else push for foreach delete vector map if " +
                            "catch void double string new throw null").split()),
                "js": set("try catch throw for while if else push for in if catch var throw new function null".split()),
                }
    # NOTE(review): "//[/]" only matches "///"; it looks like a typo for
    # "//[^/]" (compare with the other patterns) but it is kept as is
    # because changing it would alter the classification results.
    comments = {"py": re.compile("#[^#]"),
                "sql": re.compile("--[^-]"),
                "css": re.compile("//[/]"),
                "vb": re.compile("'' "),
                "xml": re.compile("<!--[^-]"),
                }
    comments["cpp"] = comments["js"] = comments["css"]

    # Tags matched with a trailing space ("<script ") are stripped so that
    # they can match keywords such as "<script" or "<meta".
    tokens = exp1.findall(code) + [tag.strip() for tag in exp2.findall(code)]
    comment_hits = {lang: len(rex.findall(code))
                    for lang, rex in comments.items()}

    # One duel per ordered pair of languages: each side scores the tokens
    # belonging to its own keywords only, plus its comment markers when the
    # two languages do not share the same comment syntax.
    duels = {}
    wins = {}
    for lang, words in keywords.items():
        for rival, rival_words in keywords.items():
            if lang == rival:
                continue
            own_words = words - rival_words
            other_words = rival_words - words
            own = sum(1 for t in tokens if t in own_words)
            other = sum(1 for t in tokens if t in other_words)
            if comments[lang] != comments[rival]:
                own += comment_hits[lang]
                other += comment_hits[rival]
            duels[lang, rival] = (own, other)
            if own >= other:
                wins[lang] = wins.get(lang, 0) + 1

    # The answer must be the only language which wins every duel.
    nb_rivals = len(keywords) - 1
    winners = [lang for lang, nb in wins.items() if nb == nb_rivals]
    if len(winners) != 1:
        return None
    ans = winners[0]

    # The score is the worst ratio obtained by the winner over its duels;
    # duels without any evidence on either side are skipped to avoid
    # dividing by zero.
    ratios = [own / (own + other)
              for (lang, _), (own, other) in duels.items()
              if lang == ans and own + other > 0]
    if not ratios:
        return None
    return (ans, min(ratios))