Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Various functions to process text 

4""" 

5 

6import os 

7import re 

8import chardet 

9 

10 

def replace_comma_by_point(file, encoding=None):
    """
    Replaces all commas by points in a file (done in place).

    The file is read entirely in memory, then overwritten.
    Line endings are preserved: the file is opened with ``newline=""``
    so that ``\\r\\n`` is neither converted while reading nor while writing.

    :param file: file to process
    :param encoding: encoding used to read and write the file,
        None to use the platform default (same as :func:`open`)
    """
    # newline="" disables newline translation, only commas are modified.
    with open(file, "r", encoding=encoding, newline="") as f:
        text = f.read()
    text = text.replace(",", ".")
    with open(file, "w", encoding=encoding, newline="") as f:
        f.write(text)

22 

23 

def file_head(filename: str, nbline=10, encoding="utf8", errors="strict"):
    """
    Extracts the first *nbline* lines of a file (assuming it is a text file).

    :param filename: filename, or any iterable of lines
        (an already opened text stream for example)
    :param nbline: number of lines, a null or negative value
        returns an empty list
    :param encoding: encoding (only used if *filename* is a string)
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
        (only used if *filename* is a string)
    :return: list of lines, each line keeps its end of line
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            return file_head(f, nbline=nbline, encoding=encoding,
                             errors=errors)

    # Without this guard, the loop below would append one line
    # before checking the bound and return it even for nbline=0.
    if nbline <= 0:
        return []

    rows = []
    for line in filename:
        rows.append(line)
        if len(rows) >= nbline:
            break
    return rows

49 

50 

def file_tail(filename: str, nbline=10, encoding="utf8", threshold=2 ** 14, errors="strict"):
    """
    Extracts the last *nbline* lines of a file (assuming it is a text file).

    :param filename: filename
    :param nbline: number of lines, a null or negative value
        returns an empty list
    :param encoding: encoding
    :param threshold: if the file size is above, the function does not read
        the beginning, only the last *threshold* bytes are loaded
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: list of lines, each line keeps its end of line
        (the last one has none if the file does not end with one)
    :raises FileNotFoundError: if *filename* does not point to an existing file

    When the file is bigger than *threshold*, the end of the file is read
    as bytes. The first byte may fall in the middle of a multi-byte
    character (utf-8 for example), in that case decoding fails and
    the function tries again after skipping one, two, then three bytes.

    The first returned line may be incomplete.
    """
    import io  # only needed to split lines the same way a text file does

    if not os.path.exists(filename):
        raise FileNotFoundError(filename)  # pragma: no cover
    if not os.path.isfile(filename):
        raise FileNotFoundError(  # pragma: no cover
            "'{0}' is not a file".format(filename))

    # rows[-0:] would be the whole list, hence the explicit guard.
    if nbline <= 0:
        return []

    size = os.stat(filename).st_size
    if size < threshold:
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            rows = f.readlines()
        return rows[-nbline:]

    # Seeking to an arbitrary offset is only well defined in binary mode.
    with open(filename, "rb") as f:
        f.seek(size - threshold)
        raw = f.read()

    # A utf-8 character takes at most 4 bytes: skipping up to 3 bytes
    # is enough to reach the beginning of the next character.
    max_shift = 3
    for shift in range(max_shift + 1):
        try:
            content = raw[shift:].decode(encoding, errors)
            break
        except UnicodeDecodeError:
            if shift == max_shift:
                raise

    # newline=None applies universal newlines, like open(..., "r") does,
    # so both branches return lines in the same format.
    with io.StringIO(content, newline=None) as buffer:
        rows = buffer.readlines()
    return rows[-nbline:]

93 

94 

def enumerate_grep(filename, regex, encoding="utf8", errors=None):
    """
    Extracts lines matching a regular expression.

    :param filename: filename, or any iterable of lines
        (an already opened text stream for example)
    :param regex: regular expression (string or compiled pattern),
        a line is kept if ``re.search`` finds a match in it
    :param encoding: encoding (only used if *filename* is a string)
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
        (only used if *filename* is a string)
    :return: iterator on lines, each line keeps its end of line
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file, the function is a generator,
        the exception is raised when the iteration starts

    .. versionadded:: 1.1
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)  # pragma: no cover
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        # The file remains opened as long as the generator is consumed.
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            yield from enumerate_grep(f, regex, encoding, errors)
    else:
        reg = re.compile(regex)
        for line in filename:
            if reg.search(line):
                yield line

121 

122 

def file_encoding(filename_or_bytes, limit=2**20):
    """
    Returns the encoding of a file.
    The function relies on `chardet <http://chardet.readthedocs.io/en/latest/usage.html>`_.

    :param filename_or_bytes: filename or bytes
        (*bytearray* and *memoryview* are accepted as well)
    :param limit: if *filename_or_bytes* is a file, the function only
        loads the first *limit* bytes (or all if *limit* is -1)
    :return: dictionary
    :raises FileNotFoundError: if *filename_or_bytes* is a string which
        does not point to an existing file
    :raises TypeError: if *filename_or_bytes* is neither a string
        nor a bytes-like object

    Example of results:

    ::

        {'encoding': 'EUC-JP', 'confidence': 0.99}
    """
    if isinstance(filename_or_bytes, str):
        if not os.path.exists(filename_or_bytes):
            raise FileNotFoundError(filename_or_bytes)
        if not os.path.isfile(filename_or_bytes):
            raise FileNotFoundError(
                "'{0}' is not a file".format(filename_or_bytes))
        with open(filename_or_bytes, "rb") as f:
            # read(limit) returns the whole content when limit is -1
            # or when the file is smaller than limit.
            content = f.read(limit)
        return chardet.detect(content)
    elif isinstance(filename_or_bytes, (bytes, bytearray, memoryview)):
        # chardet receives an immutable copy whatever the bytes-like type is.
        return chardet.detect(bytes(filename_or_bytes))
    else:
        raise TypeError("Unexpecting type for filename_or_bytes, got: {0}.".format(
            type(filename_or_bytes)))