Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Jeux de données reliés aux données carroyées. 

5""" 

6import os 

7import zipfile 

8import collections 

9import datetime 

10import tempfile 

11from io import BytesIO 

12from dbfread.field_parser import FieldParser 

13from dbfread import DBF 

14from dbfread.dbf import expand_year 

15import pandas 

16from .data_helper import get_data_folder 

17 

18 

class DBFInMemory(DBF):
    """
    Overwrites `DBF <https://github.com/olemb/dbfread/blob/master/dbfread/dbf.py#L77>`_
    to read data from memory and not from a file. The object
    `DBF <http://dbfread.readthedocs.io/en/latest/dbf_objects.html>`_
    needs a file by default. This class avoids creating an intermediate
    file when the data is compressed in a :epkg:`zip` file.
    """

    def __init__(self, filename, encoding=None, ignorecase=True,
                 lowernames=False, parserclass=FieldParser,
                 recfactory=collections.OrderedDict,
                 load=False, raw=False, ignore_missing_memofile=False,
                 char_decode_errors='strict'):
        """
        @param filename either a filename (*str*), in which case the
                        parent class :epkg:`DBF` handles everything,
                        or the raw *.dbf* content as *bytes*
        @param encoding character encoding of the table
                        (*None* lets :epkg:`dbfread` decide)
        @param ignorecase case-insensitive file lookup (parent behaviour)
        @param lowernames convert field names to lower case
        @param parserclass class used to parse field values
        @param recfactory factory building one record from a list of
                          *(name, value)* pairs; *None* keeps the raw pairs
        @param load load all records in memory immediately
        @param raw return raw bytes instead of parsed values
        @param ignore_missing_memofile do not fail when the memo file is absent
        @param char_decode_errors error handler passed to ``bytes.decode``
        """
        if isinstance(filename, str):
            # A real file name: defer entirely to the parent implementation.
            DBF.__init__(self, filename, encoding=encoding, ignorecase=ignorecase,
                         lowernames=lowernames, parserclass=parserclass,
                         recfactory=recfactory, load=load,
                         raw=raw, ignore_missing_memofile=ignore_missing_memofile,
                         char_decode_errors=char_decode_errors)
        else:
            # In-memory content (bytes): replicate DBF.__init__ but read the
            # headers from a BytesIO wrapper instead of an open file.
            self.encoding = encoding
            self.ignorecase = ignorecase
            self.lowernames = lowernames
            self.parserclass = parserclass
            self.raw = raw
            self.ignore_missing_memofile = ignore_missing_memofile
            self.char_decode_errors = char_decode_errors

            if recfactory is None:
                # Identity factory: records stay lists of (name, value) pairs.
                self.recfactory = lambda items: items
            else:
                self.recfactory = recfactory

            # No underlying file: keep the raw bytes in self.content instead.
            self.name = None
            self.filename = None
            self.content = filename

            self._records = None
            self._deleted = None

            # Filled in by the header-reading calls below.
            self.memofilename = None
            self.header = None
            self.fields = []       # namedtuples
            self.field_names = []  # strings

            # Parse the DBF header and field descriptors from memory.
            obj = BytesIO(filename)
            self._read_header(obj)
            self._read_field_headers(obj)
            self._check_headers()

            try:
                self.date = datetime.date(expand_year(self.header.year),
                                          self.header.month, self.header.day)
            except ValueError:  # pragma: no cover
                # Invalid date or '\x00\x00\x00'.
                self.date = None

            self.memofilename = self._get_memofilename()

            if load:
                self.load()

    def _iter_records(self, record_type=b' '):
        """
        Iterates over the records stored in ``self.content``,
        yielding one record per iteration. Overrides the parent
        method, which reads from ``self.filename``.

        @param record_type marker byte of the records to yield
                           (``b' '`` for live records)
        @return iterator on records built by ``self.recfactory``
        """
        # Read from the in-memory bytes instead of opening a file.
        infile = BytesIO(self.content)
        with self._open_memofile() as memofile:

            # Skip to first record.
            infile.seek(self.header.headerlen, 0)

            if not self.raw:
                field_parser = self.parserclass(self, memofile)
                parse = field_parser.parse

            # Shortcuts for speed.
            skip_record = self._skip_record
            read = infile.read

            while True:
                # Each record starts with a one-byte marker.
                sep = read(1)

                if sep == record_type:
                    if self.raw:
                        items = [(field.name, read(field.length))
                                 for field in self.fields]
                    else:
                        items = [(field.name,
                                  parse(field, read(field.length)))
                                 for field in self.fields]

                    yield self.recfactory(items)

                elif sep in (b'\x1a', b''):
                    # End of records.
                    break
                else:
                    # Record with a different marker (e.g. deleted): skip it.
                    skip_record(infile)

118 

119 

def load_dbf_from_zip(filename):
    """
    Loads a *.dbf* file compressed into a zip file.
    It only takes the first *.dbf* file from the zip.

    @param filename zip file
    @return dataframe
    """
    with zipfile.ZipFile(filename) as zipf:
        dbf_names = [info.filename for info in zipf.infolist()
                     if info.filename.endswith(".dbf")]
        if not dbf_names:
            raise FileNotFoundError(  # pragma: no cover
                "No dbf file in '{0}'".format(filename))
        # Read the raw bytes of the first .dbf entry; DBFInMemory
        # parses them without any intermediate file.
        with zipf.open(dbf_names[0], "r") as handle:
            raw = handle.read()
    records = list(DBFInMemory(raw))
    return pandas.DataFrame(records)

138 

139 

def _read_geopandas_from_bytes(mif, mid, **kwargs):
    """
    Returns a :epkg:`GeoDataFrame` from two sequences of bytes,
    one for file *.mif*, one from file *.mid*.
    Unfortunately, :epkg:`geopandas` does not read from
    a buffer, and :epkg:`fiona` does it after writing
    in a virtual file (not clear if it is a temporary file or not).

    @param mif content of the *.mif* file (bytes)
    @param mid content of the *.mid* file (bytes)
    @param kwargs additional arguments passed to ``GeoDataFrame.from_file``
    @return :epkg:`GeoDataFrame`
    """
    # Delayed import because the import fails sometimes
    # on Windows.
    from geopandas import GeoDataFrame

    # Close the temporary .mif handle before geopandas/fiona opens it:
    # on Windows a file still open for writing cannot be reopened.
    with tempfile.NamedTemporaryFile(mode='w+b', delete=False, suffix='.mif') as temp:
        temp.write(mif)
        name_mif = temp.name
    name_mid = name_mif.replace(".mif", ".mid")
    try:
        with open(name_mid, "wb") as f:
            f.write(mid)
        gdf = GeoDataFrame.from_file(name_mid, **kwargs)
    finally:
        # Always remove the temporary files, even when reading fails,
        # otherwise they accumulate in the temporary folder.
        for name in (name_mid, name_mif):
            if os.path.exists(name):
                os.remove(name)
    return gdf

164 

165 

def load_shapes_from_zip(filename):
    """
    Loads a *.mif* and a *.mid* file compressed into a zip file.
    It only takes the first *.mid* and *.mif* files from the zip.

    @param filename zip file
    @return dataframe
    """
    with zipfile.ZipFile(filename) as zipf:
        entries = zipf.infolist()

        # First .mif entry: the shape definitions.
        mif_names = [entry.filename for entry in entries
                     if entry.filename.endswith(".mif")]
        if not mif_names:
            raise FileNotFoundError(  # pragma: no cover
                "No mif file in '{0}'".format(filename))
        with zipf.open(mif_names[0], "r") as handle:
            mif_bytes = handle.read()

        # First .mid entry: the associated attributes.
        mid_names = [entry.filename for entry in entries
                     if entry.filename.endswith(".mid")]
        if not mid_names:
            raise FileNotFoundError(  # pragma: no cover
                "No mid file in '{0}'".format(filename))
        with zipf.open(mid_names[0], "r") as handle:
            mid_bytes = handle.read()

    return _read_geopandas_from_bytes(mif_bytes, mid_bytes)

191 

192 

def load_carreau_from_zip(file_car=None, file_rect=None):
    """
    Returns an example of gridded (*carroyées*) data.
    The data is available in folder
    `data <https://github.com/sdpython/papierstat/tree/master/
    src/papierstat/datasets/data>`_.
    Notebooks using this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('visualisation', 'carte_carreau')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param file_car zip file with the squares (*carreaux*);
                    defaults to the packaged ``reunion.zip``
    @param file_rect zip file with the data per rectangle;
                     defaults to the packaged ``reunion_rect.zip``
    @return 4 dataframes

    Results:

    * population data per square
    * shapefiles of the squares
    * population data per rectangle
    * shapefiles of the rectangles

    .. note::

        In order to comply with the dissemination rule on household
        fiscal income, no statistical information (except the total
        number of individuals) is published for squares of fewer than
        11 households. These low-count squares are therefore grouped
        into larger rectangles satisfying this minimum-11-households
        rule. `source: INSEE <https://www.insee.fr/fr/statistiques/2520034>`_.
    """
    # Fill each missing file independently so that a caller may
    # override only one of the two archives; the previous behaviour
    # (both None -> packaged defaults) is preserved.
    if file_car is None or file_rect is None:
        data = get_data_folder()
        if file_rect is None:
            file_rect = os.path.join(data, 'reunion_rect.zip')
        if file_car is None:
            file_car = os.path.join(data, 'reunion.zip')
    dfcar = load_dbf_from_zip(file_car)
    shpcar = load_shapes_from_zip(file_car)
    dfrect = load_dbf_from_zip(file_rect)
    shprect = load_shapes_from_zip(file_rect)
    return dfcar, shpcar, dfrect, shprect