Coverage for src/pyensae/sql/file_text_binary_columns.py : 75%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# coding: latin-1

2"""

3@file

5@brief contains a class which iterates over the rows of a text file structured as a table.

7"""

10import re

11import os

12import decimal

14from pyquickhelper.loghelper import noLOG

15from pyquickhelper.loghelper.flog import GetPath

16from .file_text_binary import TextFile

class TextFileColumns(TextFile):
    """
    Reads a text file organised in columns (TSV-like) row by row.

    The separator, the header and the type of every column are obtained
    from ``guess_columns`` when the file is opened. Iterating over the
    instance yields one dictionary ``{ column : value }`` per line; values
    of columns detected as *int*, *float* or *decimal.Decimal* are converted.

    ::

        f = TextFileColumns(filename)
        # filename is a file
        # the separator is unknown --> the class automatically determines it
        # as well as the columns and their type
        f.open()
        for d in f:
            print(d)  # d is a dictionary
        f.close()

    @var    _force_header       there is a header even if not detected
    @var    _force_noheader     there is no header even if detected
    @var    _changes            replace the columns name
    @var    _regexfix           impose a regular expression to interpret a line
                                instead of the automatically built one
    @var    _filter_dict        function which takes a dictionary and returns a boolean
                                which tells if the line must be considered or not
    @var    _fields             name of the columns (if there is no header)

    Spaces and non-ascii characters cannot be used to name a column.
    This name must be a named group for a regular expression.
    """

    def __init__(self, filename, errors=None, fLOG=noLOG, force_header=False, changes=None,
                 force_noheader=False, regex=None, filter=None, fields=None,
                 keep_text_when_bad_type=False, break_at=-1, strip_space=True,
                 force_sep=None, nb_line_guess=100, mistake=3, encoding="utf-8",
                 strict_separator=False):
        """
        @param      filename                    filename
        @param      errors                      see str (errors = ...)
        @param      fLOG                        LOG function, see `fLOG
                                                <http://www.xavierdupre.fr/app/pyquickhelper/helpsphinx/
                                                pyquickhelper/loghelper/flog.html#pyquickhelper.loghelper.flog.fLOG>`_
        @param      force_header                defines the first line as columns header whether it is relevant or not
        @param      changes                     to change the column name, gives the correspondence,
                                                example: { "query":"query___" },
                                                it can be a list if there is no header and you want to name any column
        @param      force_noheader              there is no header at all
        @param      regex                       specify a different regular expression (only if changes is a list)
                                                if it is a dictionary, the class will replace the default
                                                by the one associated in regex for this field
        @param      filter                      None if there is no filter, otherwise it is a function
                                                which takes a dictionary and returns a boolean
                                                which tells if the line must be considered or not
        @param      fields                      when the header is not here, these fields will name the columns
        @param      keep_text_when_bad_type     keep the value as text when the type conversion does not work
        @param      break_at                    if != -1, stop when this limit is reached
        @param      strip_space                 remove space around columns if True
        @param      force_sep                   if != None, impose a column separator
        @param      nb_line_guess               number of lines used to guess types
        @param      mistake                     not more than mistake conversion in numbers are allowed
        @param      encoding                    encoding
        @param      strict_separator            strict number of columns, it assumes there is no separator
                                                in the content of every column

        An exception is raised if *changes* is a list, *regex* is a string
        without named groups and both do not hold the same number of fields.
        """
        if changes is None:
            changes = {}

        TextFile.__init__(self, filename, errors, fLOG=fLOG, encoding=encoding)

        self._force_header = force_header
        self._force_noheader = force_noheader
        self._changes = changes
        self._regexfix = regex
        self._filter_dict = filter
        self._fields = fields
        self._keep_text_when_bad_type = keep_text_when_bad_type
        self._break_at = break_at
        self._strip_space = strip_space
        self._force_sep = force_sep
        self._nb_guess_line = nb_line_guess
        self._mistake = mistake
        self._strict_separator = strict_separator
        self._encoding = encoding

        if not isinstance(changes, list):
            self.LOG(" TextFileColumns (2): regex: ", self._regexfix)
            return

        # A list of names is turned into a mapping c0, c1, ... -> name.
        # The keys are deliberately not zero-padded: this is exactly what the
        # previous implementation produced (its padding width was always 0).
        self._changes = {"c%d" % i: c for i, c in enumerate(changes)}

        anonymous_groups = (self._regexfix is not None and
                            not isinstance(self._regexfix, dict) and
                            "(?P<" not in self._regexfix)
        if not anonymous_groups:
            self.LOG(" TextFileColumns (1): regex: ", self._regexfix)
            return

        # The regular expression only has anonymous groups: each of them is
        # named after the corresponding entry of *changes*.
        fi = re.compile("[(](.+?)[)]").findall(self._regexfix)
        if len(fi) != len(changes):
            raise Exception(
                "not the same number of fields in regular expression (%d,%d):\n%s\n%s" %
                (len(fi), len(changes), str(fi), str(changes)))
        exp = ["(?P<%s>%s)" % (b, a) for a, b in zip(fi, changes)]
        # The character following the first group is taken as the separator.
        # A slice is used so that a pattern made of a single group (nothing
        # after the closing parenthesis) does not raise IndexError.
        p = self._regexfix.find(")") + 1
        sep = self._regexfix[p:p + 1]
        self._regexfix = sep.join(exp)
        self.LOG("split: ", fi)
        self.LOG("new regex: ", self._regexfix)

    def __str__(self):
        """
        Returns the string representation of the instance attributes.
        """
        return str(self.__dict__)

    def get_columns(self):
        """
        @return         the columns

        An exception is raised if the columns are not known yet
        (attribute *_columns* is set by @see me open).
        """
        if "_columns" not in self.__dict__:
            raise Exception("there is no available columns")
        return self._columns

    def open(self):
        """
        Opens the file and finds out if there is a header, what are the columns,
        what are their type... The detection is delegated to ``guess_columns``
        and only happens if the format is not already known.
        """
        if "_header" not in self.__dict__:
            header, columns, sep, regex = self.guess_columns(
                force_header=self._force_header,
                changes=self._changes,
                force_noheader=self._force_noheader,
                fields=self._fields,
                regex=self._regexfix if isinstance(self._regexfix, dict) else {},
                force_sep=self._force_sep,
                nb=self._nb_guess_line,
                mistake=self._mistake)
            # A regular expression given as a string overrides the guessed one.
            if self._regexfix is not None and not isinstance(self._regexfix, dict):
                regex = self._regexfix
            self._header = header
            self._columns = columns
            self._sep = sep
            try:
                self._regex = re.compile(regex)
            except Exception as e:
                raise RuntimeError(
                    "algorithm problem: (type %r, %r)\nunable to understand a regular expression (file %r)\nexp: %r" %
                    (str(type(e)), str(e), self.filename, regex)) from e
            self._name = {}
            self._nb = 0
            self._conv = {}
            for k, v in self._columns.items():
                # v is (column name, column type), k is the column index
                self._name[v[0]] = (k, v[1])
                if v[1] in [int, float, decimal.Decimal]:
                    self._conv[v[0]] = v[1]
        # number of pending calls to open, decreased by close
        self._nb += 1
        TextFile.open(self)

    def close(self):
        """
        Closes the file and removes all information related to the format
        once every call to @see me open was matched by a call to this method,
        next time it is opened, the format will be checked again.
        """
        TextFile.close(self)
        self._nb -= 1
        if self._nb == 0:
            for name in ("_header", "_columns", "_regex", "_name", "_conv"):
                self.__dict__.pop(name, None)

    def _split_line(self, line, regex_simple):
        """
        Splits a line into a dictionary ``{ column_name: text }``.

        @param      line            line to interpret
        @param      regex_simple    non greedy version of the regular expression
        @return                     dictionary or None if the line cannot be interpreted
        """
        pieces = line.split(self._sep)
        nbcol = len(self._columns)

        if len(pieces) == nbcol:
            # easy case: as many pieces as columns
            return {self._columns[i][0]: a for i, a in enumerate(pieces)}
        if self._strict_separator or len(pieces) < nbcol:
            # too few pieces, or extra separators while none is allowed in the content
            return None
        # too many pieces: the separator appears inside a column,
        # the regular expressions decide (non greedy first)
        r = regex_simple.match(line)
        if r is None:
            r = self._regex.match(line)
        return None if r is None else r.groupdict()

    def __iter__(self):
        """
        Iterates over the rows of the file, the header (if any) is skipped.

        @return     a dictionary ``{ column_name: value }`` for every row

        An exception is raised if the file is not open or if too many lines
        cannot be interpreted or converted.
        """
        if "_header" not in self.__dict__:
            raise Exception("file not open %s" % self.filename)

        regex_simple = re.compile(self._regex.pattern.replace(">.*)", ">.*?)"))
        # decimal.Decimal raises InvalidOperation (not a ValueError)
        # when the text is not a number
        conversion_errors = (ValueError, decimal.InvalidOperation)

        nb = 0      # number of lines read (see NOTE below)
        nberr = 0   # number of lines which could not be split
        nbert = 0   # number of values which could not be converted
        for line in TextFile.__iter__(self):
            if nb == 0 and self._header:
                nb += 1
                continue

            res = self._split_line(line, regex_simple)

            if res is None:
                if nberr == 0:
                    self.LOG(self._regex.pattern)
                self.LOG("error regex", nberr, "unable to interpret line ",
                         nb, ": ", repr(line))
                nberr += 1
                if nberr * 10 > nb and nberr > 4:
                    message = "pattern: %s\n line: %s" % (
                        regex_simple.pattern, line)
                    raise Exception(
                        "(a) there are probably too many errors %d (%d)\n%s" %
                        (nberr, nb, message))
            else:
                if self._strip_space:
                    for k in res:
                        res[k] = res[k].strip()

                giveup = False
                for k in res:
                    if k not in self._conv:
                        continue
                    conv = self._conv[k]
                    try:
                        # an empty cell is read as zero
                        res[k] = conv(0) if len(res[k]) == 0 else conv(res[k])
                    except conversion_errors:
                        nbert += 1
                        if self._keep_text_when_bad_type:
                            # the text is kept, only one error out of 1000 is logged
                            if nbert % 1000 == 1:
                                self.LOG("error type", nbert,
                                         "unable to interpret line (but keep it) ",
                                         nb, "value", repr(res[k]), " type ",
                                         repr(conv), " line ", repr(line))
                        else:
                            self.LOG("error type", nbert, "unable to interpret line ",
                                     nb, "value", repr(res[k]), " type ",
                                     repr(conv), " line ", repr(line))
                            if nbert * 10 > nb and nbert > 4:
                                message = "pattern: %s\n line: %s" % (
                                    regex_simple.pattern, line)
                                raise RuntimeError(  # pylint: disable=W0707
                                    "(b) there are probably too many errors %r\n%r" %
                                    (nbert, message))
                            giveup = True
                            break

                if giveup:
                    # NOTE: a dropped line is not counted in nb (historical behaviour)
                    continue
                if self._filter_dict is None or self._filter_dict(res):
                    yield res

            nb += 1
            if self._break_at != -1 and nb > self._break_at:
                break

    @staticmethod
    def _store(output, la, encoding="utf-8"):
        """
        Stores a list of dictionaries into a file (adds a header),
        columns are separated by tabulations and sorted by name.

        @param      output      filename
        @param      la          list of dictionary key:value
        @param      encoding    encoding

        The header is built from the keys of the first dictionary,
        nothing is written if *la* is empty.
        """
        sepline = "\n"
        with open(output, "w", encoding=encoding) as f:
            keys = None
            for d in la:
                if keys is None:
                    keys = sorted(d.keys())
                    f.write("\t".join(keys) + sepline)
                f.write("\t".join(str(d[k]) for k in keys) + sepline)

    def sort(self, output, key, maxmemory=2 ** 28, folder=None, fLOG=noLOG):
        """
        Sorts a text file, even a big one, one or several columns gives the order.

        @param      output      output file result
        @param      key         lines sorted depending of these columns
                                (a column name or a sequence of column names)
        @param      maxmemory   a temporary file is written every time more than
                                *maxmemory* lines are held in memory
        @param      folder      the function needs to create temporary files,
                                this folder will contain them before they get removed,
                                ``GetPath()`` is used if None
        @param      fLOG        unused, kept for compatibility (``self.LOG`` is used)

        @warning We assume this file is not opened.
        @warning Temporary files and the output are written with the default
                 encoding of @see me _store and @see me fusion (utf-8).
        """
        if isinstance(key, str):
            key = (key,)
        if folder is None:
            folder = GetPath()
        if not os.path.exists(folder):
            raise Exception("unable to find folder %s" % folder)

        # checks the output can be created before doing any work
        try:
            with open(output, "w", encoding=self._encoding):
                pass
        except Exception as e:
            raise RuntimeError(
                "Unable to create file %r, reason: %r" %
                (output, str(e))) from e

        self.LOG("sorting file ", self.filename)
        files = []
        memo = []

        def flush():
            "Sorts the rows held in memory and writes them into a new temporary file."
            memo.sort(key=lambda el: el[0])
            tempout = os.path.join(folder, "root_%05d.txt" % len(files))
            self.LOG("writing file %d lines in " % len(memo), tempout)
            TextFileColumns._store(tempout, [el[1] for el in memo])
            files.append(tempout)
            del memo[:]

        try:
            self.open()
            try:
                for line in self:
                    try:
                        k = tuple(line[k] for k in key)
                    except KeyError as e:
                        raise Exception("unable to find one column in\n{0}".format(
                            self.get_columns())) from e
                    memo.append((k, line))
                    if len(memo) > maxmemory:
                        flush()
                if len(memo) > 0:
                    flush()
            finally:
                self.close()

            TextFileColumns.fusion(
                key,
                files,
                output,
                force_header=self._force_header,
                fLOG=self.LOG)
        finally:
            # temporary files are removed even if something went wrong
            for m in files:
                self.LOG("removing ", m)
                os.remove(m)

    @staticmethod
    def fusion(key, files, output, force_header=False, encoding="utf-8", fLOG=noLOG):
        """
        Does a fusion between several files
        with the same columns (different order is allowed).

        @param      key             columns to be compared
                                    (a column name or a sequence of column names)
        @param      files           list of files
        @param      output          output file
        @param      force_header    impose the first line as a header
        @param      encoding        encoding
        @param      fLOG            logging function
        @warning We assume all files are sorted depending on columns in key

        The header of the output is built from the columns of the first row written.
        """
        if isinstance(key, str):
            key = [key]
        sepline = "\n"

        handles = []
        try:
            for f in files:
                h = TextFileColumns(f, force_header=force_header,
                                    encoding=encoding, fLOG=fLOG)
                h.open()
                handles.append((h, iter(h)))

            with open(output, "w", encoding=encoding) as res:
                # start: first row of every file,
                # every pending item is (sort key, row, file, iterator)
                kline = []
                for h, it in handles:
                    d = next(it, None)
                    if d is None:
                        continue
                    try:
                        k = tuple(d[k_] for k_ in key)
                    except KeyError as e:
                        raise Exception("unable to find one column in\n{0}".format(
                            h.get_columns())) from e
                    kline.append((k, d, h, it))

                keys = None
                while len(kline) > 0:
                    # picks the pending row with the smallest key
                    # (the first one in case of ties)
                    pos = min(range(len(kline)), key=lambda i: kline[i][0])
                    _, d, h, it = kline.pop(pos)

                    # adding
                    if keys is None:
                        keys = sorted(d.keys())
                        res.write("\t".join(keys) + sepline)
                    res.write("\t".join(str(d[k_]) for k_ in keys) + sepline)

                    # next row of the file which was just consumed
                    d = next(it, None)
                    if d is not None:
                        k = tuple(d[k_] for k_ in key)
                        kline.append((k, d, h, it))
        finally:
            # end: every opened file is closed, even if an error occurred
            for h, _ in handles:
                h.close()

Coverage for src/pyensae/sql/file_text_binary_columns.py : 75%

264 statements

Coverage for src/pyensae/sql/file_text_binary_columns.py : 75%

264 statements 198 run 66 missing 0 excluded

264 statements