Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# coding: latin-1 

2""" 

3@file 

4 

5@brief contains a class which iterates over the rows of a text file structured as a table. 

6 

7""" 

8 

9 

10import re 

11import os 

12import decimal 

13 

14from pyquickhelper.loghelper import noLOG 

15from pyquickhelper.loghelper.flog import GetPath 

16from .file_text_binary import TextFile 

17 

18 

class TextFileColumns(TextFile):
    """
    This class opens a text file as if it were a binary file. It can deal with null characters.
    The file is interpreted as a TSV file or file containing columns.
    The separator is found automatically.
    The columns are assumed to be in the first line but it is not mandatory.
    It walks along a file through an iterator, every line is automatically
    converted into a dictionary ``{ column : value }``.
    If the class was able to guess the type of a column,
    the conversion will automatically take place.

    ::

        f = TextFileColumns(filename)
        # filename is a file
        # the separator is unknown --> the class automatically determines it
        # as well as the columns and their type
        f.open()
        for d in f:
            print(d) # d is a dictionary
        f.close()

    @var    _force_header       there is a header even if not detected
    @var    _force_noheader     there is no header even if detected
    @var    _changes            replace the columns name
    @var    _regexfix           impose a regular expression to interpret a line
                                instead of the automatically built one
    @var    _filter_dict        it is a function which takes a dictionary and returns
                                a boolean which tells if the line must be considered or not
    @var    _fields             name of the columns (if there is no header)

    Spaces and non-ascii characters cannot be used to name a column.
    This name must be a named group for a regular expression.
    """

    def __init__(self, filename, errors=None, fLOG=noLOG, force_header=False, changes=None,
                 force_noheader=False, regex=None, filter=None, fields=None,
                 keep_text_when_bad_type=False, break_at=-1, strip_space=True,
                 force_sep=None, nb_line_guess=100, mistake=3, encoding="utf-8",
                 strict_separator=False):
        """
        @param      filename                    filename
        @param      errors                      see str (errors = ...)
        @param      fLOG                        LOG function, see `fLOG
                                                <http://www.xavierdupre.fr/app/pyquickhelper/helpsphinx/
                                                pyquickhelper/loghelper/flog.html#pyquickhelper.loghelper.flog.fLOG>`_
        @param      force_header                defines the first line as columns header
                                                whether it is relevant or not
        @param      changes                     to change the column name, gives the correspondence,
                                                example: { "query":"query___" },
                                                it can be a list if there is no header
                                                and you want to name every column
        @param      force_noheader              there is no header at all
        @param      regex                       specify a different regular expression
                                                (only if changes is a list),
                                                if it is a dictionary, the class will replace the default
                                                by the one associated in regex for this field
        @param      filter                      None if there is no filter, otherwise it is a function
                                                which takes a dictionary and returns a boolean
                                                which tells if the line must be considered or not
        @param      fields                      when the header is not here, these fields will name the columns
        @param      keep_text_when_bad_type     keep the value when the type conversion does not work
        @param      break_at                    if != -1, stop when this limit is reached
        @param      strip_space                 remove space around columns if True
        @param      force_sep                   if != None, impose a column separator
        @param      nb_line_guess               number of lines used to guess types
        @param      mistake                     not more than mistake conversion in numbers are allowed
        @param      encoding                    encoding
        @param      strict_separator            strict number of columns, it assumes there is
                                                no separator in the content of every column
        """
        if changes is None:
            changes = {}

        TextFile.__init__(self, filename, errors, fLOG=fLOG, encoding=encoding)

        self._force_header = force_header
        self._force_noheader = force_noheader
        self._changes = changes
        self._regexfix = regex
        self._filter_dict = filter
        self._fields = fields
        self._keep_text_when_bad_type = keep_text_when_bad_type
        self._break_at = break_at
        self._strip_space = strip_space
        self._force_sep = force_sep
        self._nb_guess_line = nb_line_guess
        self._mistake = mistake
        self._strict_separator = strict_separator
        self._encoding = encoding

        if isinstance(changes, list):
            # A list names the columns by position: it is turned into a mapping
            # "c<index>" -> new name. The previous digit-counting loop never
            # incremented its counter, so the effective format was always
            # "c%00d", i.e. "c%d"; the generated keys are unchanged.
            self._changes = {"c%d" % i: c for i, c in enumerate(changes)}

            if self._regexfix is not None and \
                    not isinstance(self._regexfix, dict) and \
                    "(?P<" not in self._regexfix:
                # The regular expression has anonymous groups: every group is
                # renamed after the corresponding entry of changes.
                groups = re.compile("[(](.+?)[)]").findall(self._regexfix)
                if len(groups) != len(changes):
                    raise Exception(
                        "not the same number of fields in regular expression (%d,%d):\n%s\n%s" %
                        (len(groups), len(changes), str(groups), str(changes)))
                named = ["(?P<%s>%s)" % (name, pattern)
                         for pattern, name in zip(groups, changes)]
                # The character following the first group is used as the
                # separator between groups; a pattern ending right after its
                # first group has none (it used to raise IndexError).
                p = self._regexfix.find(")") + 1
                sep = self._regexfix[p] if p < len(self._regexfix) else ""
                self._regexfix = sep.join(named)
                self.LOG("split: ", groups)
                self.LOG("new regex: ", self._regexfix)
            else:
                self.LOG(" TextFileColumns (1): regex: ", self._regexfix)
        else:
            self.LOG(" TextFileColumns (2): regex: ", self._regexfix)

    def __str__(self):
        """
        Returns the string representation of the instance attributes (``__dict__``).
        """
        return str(self.__dict__)

    def get_columns(self):
        """
        @return     the columns found by @see me open
        @raise      Exception   if the columns are not available (file not opened)
        """
        if "_columns" not in self.__dict__:
            raise Exception("there is no available columns")
        return self._columns

    def open(self):
        """
        Opens the file and find out if there is a header, what are the columns, what are their type...
        any information about which format was found is logged.
        The format is only guessed the first time, the method then counts
        how many times the file was opened (see @see me close).

        @raise      RuntimeError    if the regular expression describing a line cannot be compiled
        """
        if "_header" not in self.__dict__:
            header, columns, sep, regex = self.guess_columns(
                force_header=self._force_header,
                changes=self._changes,
                force_noheader=self._force_noheader,
                fields=self._fields,
                regex=self._regexfix if isinstance(self._regexfix, dict) else {},
                force_sep=self._force_sep,
                nb=self._nb_guess_line,
                mistake=self._mistake)
            if self._regexfix is not None and not isinstance(self._regexfix, dict):
                # a regular expression imposed by the user replaces the guessed one
                regex = self._regexfix

            # The expression is compiled before any attribute is stored so
            # that a failure does not leave the instance half initialised.
            try:
                compiled = re.compile(regex)
            except Exception as e:
                raise RuntimeError(
                    "algorithm problem: (type %r, %r)\nunable to understand a regular expression (file %r)\nexp: %r" %
                    (str(type(e)), str(e), self.filename, regex)) from e

            self._header = header
            self._columns = columns
            self._sep = sep
            self._regex = compiled
            self._name = {}
            self._nb = 0
            self._conv = {}
            # _columns maps a key to a pair (name, type): _name is the reverse
            # index, _conv only keeps the columns with a numerical type.
            for k, v in self._columns.items():
                self._name[v[0]] = (k, v[1])
                if v[1] in [int, float, decimal.Decimal]:
                    self._conv[v[0]] = v[1]

        self._nb += 1
        TextFile.open(self)

    def close(self):
        """
        Closes the file and remove all information related to the format,
        next time it is opened, the format will be checked again.
        The information is only removed when the file was closed
        as many times as it was opened.
        """
        TextFile.close(self)
        self._nb -= 1
        if self._nb == 0:
            for name in ("_header", "_columns", "_regex", "_name", "_conv"):
                del self.__dict__[name]

    def __iter__(self):
        """
        Iterates on the rows of the file, the header (if any) is skipped.
        A line is first split with the separator; if it does not produce the
        expected number of columns and *strict_separator* is False,
        the regular expression is used instead.

        @return     a dictionary ``{ column_name: value }``
        @raise      Exception       if the file is not opened or if too many lines cannot be interpreted
        @raise      RuntimeError    if too many values cannot be converted into their column type
        """
        if "_header" not in self.__dict__:
            raise Exception("file not open %s" % self.filename)

        # non greedy version of the regular expression, tried first
        regex_simple = re.compile(self._regex.pattern.replace(">.*)", ">.*?)"))
        nb_columns = len(self._columns)
        numeric = (int, float, decimal.Decimal)

        nb = 0      # number of processed lines (header included)
        nberr = 0   # number of lines which could not be split into columns
        nbert = 0   # number of values which could not be converted
        for line in TextFile.__iter__(self):
            if nb == 0 and self._header:
                nb += 1
                continue

            tempc = line.split(self._sep)
            if len(tempc) == nb_columns:
                res = {self._columns[i][0]: a for i, a in enumerate(tempc)}
            elif self._strict_separator or len(tempc) < nb_columns:
                # not enough columns, or no separator allowed inside a column
                res = None
            else:
                # too many pieces: the separator appears inside a column
                r = regex_simple.match(line)
                if r is None:
                    r = self._regex.match(line)
                res = None if r is None else r.groupdict()

            if res is None:
                if nberr == 0:
                    self.LOG(self._regex.pattern)
                self.LOG("error regex", nberr, "unable to interpret line ",
                         nb, ": ", repr(line))
                nberr += 1
                if nberr * 10 > nb and nberr > 4:
                    message = "pattern: %s\n line: %s" % (
                        regex_simple.pattern, line)
                    raise Exception(
                        "(a) there are probably too many errors %d (%d)\n%s" %
                        (nberr, nb, message))
            else:
                if self._strip_space:
                    for k in res:
                        res[k] = res[k].strip()

                giveup = False
                for k in res:
                    if k not in self._conv:
                        continue
                    conv = self._conv[k]
                    try:
                        if len(res[k]) == 0 and conv in numeric:
                            # an empty cell is read as zero
                            res[k] = conv(0)
                        else:
                            res[k] = conv(res[k])
                    except (ValueError, decimal.InvalidOperation) as exc:
                        # decimal.Decimal raises InvalidOperation (not a
                        # ValueError) on a malformed string.
                        nbert += 1
                        if self._keep_text_when_bad_type:
                            if nbert % 1000 == 1:
                                self.LOG("error type", nbert,
                                         "unable to interpret line (but keep it) ",
                                         nb, "value", repr(res[k]),
                                         " type ", repr(conv),
                                         " line ", repr(line))
                        else:
                            self.LOG("error type", nbert, "unable to interpret line ",
                                     nb, "value", repr(res[k]),
                                     " type ", repr(conv),
                                     " line ", repr(line))
                            if nbert * 10 > nb and nbert > 4:
                                message = "pattern: %s\n line: %s" % (
                                    regex_simple.pattern, line)
                                raise RuntimeError(
                                    "(b) there are probably too many errors %r\n%r" %
                                    (nbert, message)) from exc
                            giveup = True
                            break

                if giveup:
                    # a rejected row is neither returned nor counted in nb
                    continue
                if self._filter_dict is None or self._filter_dict(res):
                    yield res

            nb += 1
            if self._break_at != -1 and nb > self._break_at:
                break

    @staticmethod
    def _store(output, la, encoding="utf-8"):
        """
        Stores a list of dictionaries into a file (add a header).
        The header is the sorted list of keys of the first dictionary,
        values are separated by tabulations.

        @param      output      filename
        @param      la          list of dictionary key:value
        @param      encoding    encoding
        @warning    format is utf-8 (default value of *encoding*)
        """
        sepline = "\n"
        keys = None
        with open(output, "w", encoding=encoding) as f:
            for d in la:
                if keys is None:
                    keys = sorted(d.keys())
                    f.write("\t".join(keys) + sepline)
                f.write("\t".join(str(d[k]) for k in keys) + sepline)

    def _write_sorted_chunk(self, memo, files, folder):
        """
        Sorts the pairs *(key, row)* stored in *memo*, writes the rows into a
        new temporary file of *folder*, registers this file into *files*
        and empties *memo*.
        """
        memo.sort(key=lambda el: el[0])
        tempout = os.path.join(folder, "root_%05d.txt" % len(files))
        self.LOG("writing file %d lines in " % len(memo), tempout)
        # registered before being written so that it gets removed even if
        # the writing fails
        files.append(tempout)
        TextFileColumns._store(tempout, [el[1] for el in memo])
        del memo[:]

    def sort(self, output, key, maxmemory=2 ** 28, folder=None, fLOG=noLOG):
        """
        Sorts a text file, even a big one, one or several columns gives the order.
        The rows are split into sorted temporary files which are then merged
        with @see me fusion.

        @param      output      output file result
        @param      key         lines sorted depending of these columns
                                (a column name or a sequence of column names)
        @param      maxmemory   a temporary file is written as soon as more than
                                *maxmemory* lines are stored in memory
        @param      folder      the function needs to create temporary files, this folder will contain them
                                before they get removed (``GetPath()`` if None)
        @param      fLOG        logging function (not used here, messages go through ``self.LOG``)
        @raise      Exception       if *folder* does not exist or if a column of *key* is missing
        @raise      RuntimeError    if *output* cannot be created

        @warning We assume this file is not opened.
        """
        if isinstance(key, str):
            key = (key,)
        if folder is None:
            folder = GetPath()
        if not os.path.exists(folder):
            raise Exception("unable to find folder %s" % folder)

        # checks the output can be created before doing anything costly
        try:
            with open(output, "w", encoding=self._encoding):
                pass
        except Exception as e:
            raise RuntimeError(
                "Unable to create file %r, reason: %r" %
                (output, str(e))) from e

        self.LOG("sorting file ", self.filename)
        files = []
        memo = []
        try:
            self.open()
            try:
                for line in self:
                    try:
                        k = tuple(line[c] for c in key)
                    except KeyError as e:
                        raise Exception("unable to find one column in\n{0}".format(
                            self.get_columns())) from e
                    memo.append((k, line))
                    if len(memo) > maxmemory:
                        self._write_sorted_chunk(memo, files, folder)

                if len(memo) > 0:
                    self._write_sorted_chunk(memo, files, folder)
            finally:
                self.close()

            TextFileColumns.fusion(
                key,
                files,
                output,
                force_header=self._force_header,
                fLOG=self.LOG)
        finally:
            # temporary files are removed even if something went wrong
            for m in files:
                if os.path.exists(m):
                    self.LOG("removing ", m)
                    os.remove(m)

    @staticmethod
    def fusion(key, files, output, force_header=False, encoding="utf-8", fLOG=noLOG):
        """
        Does a fusion between several files
        with the same columns (different order is allowed).
        The header of the output is the sorted list of columns of the first written row.

        @param      key             columns to be compared
                                    (a column name or a sequence of column names)
        @param      files           list of files
        @param      output          output file
        @param      force_header    impose the first line as a header
        @param      encoding        encoding
        @param      fLOG            logging function
        @raise      Exception       if a column of *key* is missing in one file
        @warning    We assume all files are sorted depending on columns in key
        """
        if isinstance(key, str):
            key = [key]
        sepline = "\n"

        def sort_key(handle, row):
            # builds the tuple used to compare rows coming from different files
            try:
                return tuple(row[c] for c in key)
            except KeyError as e:
                raise Exception("unable to find one column in\n{0}".format(
                    handle.get_columns())) from e

        opened = []
        try:
            for f in files:
                h = TextFileColumns(f, force_header=force_header,
                                    encoding=encoding, fLOG=fLOG)
                h.open()
                opened.append(h)

            with open(output, "w", encoding=encoding) as res:
                # start: one pending row per non empty file,
                # every item is [key, row, file, iterator]
                kline = []
                for h in opened:
                    it = iter(h)
                    d = next(it, None)
                    if d is not None:
                        kline.append([sort_key(h, d), d, h, it])

                # loop
                keys = None
                while kline:
                    # minimum, the first one is chosen in case of equality
                    pos = min(range(len(kline)), key=lambda i: kline[i][0])
                    _, d, h, it = kline.pop(pos)

                    # adding
                    if keys is None:
                        keys = sorted(d.keys())
                        res.write("\t".join(keys) + sepline)
                    res.write("\t".join(str(d[k_]) for k_ in keys) + sepline)

                    # next row of the file the written row came from
                    d = next(it, None)
                    if d is not None:
                        kline.append([sort_key(h, d), d, h, it])
        finally:
            # end
            for h in opened:
                h.close()