Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Jeux de données reliés aux données carroyées. 

5""" 

6import os 

7import zipfile 

8import collections 

9import datetime 

10import tempfile 

11from io import BytesIO 

12from dbfread.field_parser import FieldParser 

13from dbfread import DBF 

14from dbfread.dbf import expand_year 

15import pandas 

16from .data_helper import get_data_folder 

17 

18 

class DBFInMemory(DBF):
    """
    Overwrites `DBF <https://github.com/olemb/dbfread/blob/master/dbfread/dbf.py#L77>`_
    to read data from memory and not from a file. The object
    `DBF <http://dbfread.readthedocs.io/en/latest/dbf_objects.html>`_
    needs a file by default. This class avoids creating an intermediate
    file when the data is compressed in a :epkg:`zip` file.
    """

    def __init__(self, filename, encoding=None, ignorecase=True,
                 lowernames=False, parserclass=FieldParser,
                 recfactory=collections.OrderedDict,
                 load=False, raw=False, ignore_missing_memofile=False,
                 char_decode_errors='strict'):
        """
        @param filename either a filename (*str*), in which case the
                        parent class :epkg:`DBF` handles everything,
                        or the raw *.dbf* content as *bytes*
        @param encoding character encoding of the table
                        (*None* lets :epkg:`dbfread` decide)
        @param ignorecase case-insensitive file lookup (parent behaviour)
        @param lowernames convert field names to lower case
        @param parserclass class used to parse field values
        @param recfactory factory building one record from a list of
                          *(name, value)* pairs; *None* keeps the raw pairs
        @param load load all records in memory immediately
        @param raw return raw bytes instead of parsed values
        @param ignore_missing_memofile do not fail when the memo file is absent
        @param char_decode_errors error handler passed to ``bytes.decode``
        """
        if isinstance(filename, str):
            # A real file name: defer entirely to the parent implementation.
            DBF.__init__(self, filename, encoding=encoding, ignorecase=ignorecase,
                         lowernames=lowernames, parserclass=parserclass,
                         recfactory=recfactory, load=load,
                         raw=raw, ignore_missing_memofile=ignore_missing_memofile,
                         char_decode_errors=char_decode_errors)
        else:
            # In-memory content (bytes): replicate DBF.__init__ but read the
            # headers from a BytesIO wrapper instead of an open file.
            self.encoding = encoding
            self.ignorecase = ignorecase
            self.lowernames = lowernames
            self.parserclass = parserclass
            self.raw = raw
            self.ignore_missing_memofile = ignore_missing_memofile
            self.char_decode_errors = char_decode_errors

            if recfactory is None:
                # Identity factory: records stay lists of (name, value) pairs.
                self.recfactory = lambda items: items
            else:
                self.recfactory = recfactory

            # No underlying file: keep the raw bytes in self.content instead.
            self.name = None
            self.filename = None
            self.content = filename

            self._records = None
            self._deleted = None

            # Filled in by the header-reading calls below.
            self.memofilename = None
            self.header = None
            self.fields = []       # namedtuples
            self.field_names = []  # strings

            # Parse the DBF header and field descriptors from memory.
            obj = BytesIO(filename)
            self._read_header(obj)
            self._read_field_headers(obj)
            self._check_headers()

            try:
                self.date = datetime.date(expand_year(self.header.year),
                                          self.header.month, self.header.day)
            except ValueError:  # pragma: no cover
                # Invalid date or '\x00\x00\x00'.
                self.date = None

            self.memofilename = self._get_memofilename()

            if load:
                self.load()

    def _iter_records(self, record_type=b' '):
        """
        Iterates over the records stored in ``self.content``,
        yielding one record per iteration. Overrides the parent
        method, which reads from ``self.filename``.

        @param record_type marker byte of the records to yield
                           (``b' '`` for live records)
        @return iterator on records built by ``self.recfactory``
        """
        # Read from the in-memory bytes instead of opening a file.
        infile = BytesIO(self.content)
        with self._open_memofile() as memofile:

            # Skip to first record.
            infile.seek(self.header.headerlen, 0)

            if not self.raw:
                field_parser = self.parserclass(self, memofile)
                parse = field_parser.parse

            # Shortcuts for speed.
            skip_record = self._skip_record
            read = infile.read

            while True:
                # Each record starts with a one-byte marker.
                sep = read(1)

                if sep == record_type:
                    if self.raw:
                        items = [(field.name, read(field.length))
                                 for field in self.fields]
                    else:
                        items = [(field.name,
                                  parse(field, read(field.length)))
                                 for field in self.fields]

                    yield self.recfactory(items)

                elif sep in (b'\x1a', b''):
                    # End of records.
                    break
                else:
                    # Record with a different marker (e.g. deleted): skip it.
                    skip_record(infile)

118 

119 

def load_dbf_from_zip(filename):
    """
    Loads a *.dbf* file compressed into a zip file.
    It only takes the first *.dbf* file from the zip.

    @param filename zip file
    @return dataframe
    """
    with zipfile.ZipFile(filename) as zipf:
        dbf_names = [info.filename for info in zipf.infolist()
                     if info.filename.endswith(".dbf")]
        if not dbf_names:
            raise FileNotFoundError(  # pragma: no cover
                "No dbf file in '{0}'".format(filename))
        # Read the raw bytes of the first .dbf entry; DBFInMemory
        # parses them without any intermediate file.
        with zipf.open(dbf_names[0], "r") as handle:
            raw = handle.read()
    records = list(DBFInMemory(raw))
    return pandas.DataFrame(records)

138 

139 

def _read_geopandas_from_bytes(mif, mid, **kwargs):
    """
    Returns a :epkg:`GeoDataFrame` from two sequences of bytes,
    one for file *.mif*, one from file *.mid*.
    Unfortunately, :epkg:`geopandas` does not read from
    a buffer, and :epkg:`fiona` does it after writing
    in a virtual file (not clear if it is a temporary file or not).

    @param mif content of the *.mif* file (bytes)
    @param mid content of the *.mid* file (bytes)
    @param kwargs additional arguments passed to ``GeoDataFrame.from_file``
    @return :epkg:`GeoDataFrame`
    """
    # Delayed import because the import fails sometimes
    # on Windows.
    from geopandas import GeoDataFrame

    # Close the temporary .mif handle before geopandas/fiona opens it:
    # on Windows a file still open for writing cannot be reopened.
    with tempfile.NamedTemporaryFile(mode='w+b', delete=False, suffix='.mif') as temp:
        temp.write(mif)
        name_mif = temp.name
    name_mid = name_mif.replace(".mif", ".mid")
    try:
        with open(name_mid, "wb") as f:
            f.write(mid)
        gdf = GeoDataFrame.from_file(name_mid, **kwargs)
    finally:
        # Always remove the temporary files, even when reading fails,
        # otherwise they accumulate in the temporary folder.
        for name in (name_mid, name_mif):
            if os.path.exists(name):
                os.remove(name)
    return gdf

164 

165 

def load_shapes_from_zip(filename):
    """
    Loads a *.mif* and a *.mid* file compressed into a zip file.
    It only takes the first *.mid* and *.mif* files from the zip.

    @param filename zip file
    @return dataframe
    """
    with zipfile.ZipFile(filename) as zipf:
        entries = zipf.infolist()

        # First .mif entry: the shape definitions.
        mif_names = [entry.filename for entry in entries
                     if entry.filename.endswith(".mif")]
        if not mif_names:
            raise FileNotFoundError(  # pragma: no cover
                "No mif file in '{0}'".format(filename))
        with zipf.open(mif_names[0], "r") as handle:
            mif_bytes = handle.read()

        # First .mid entry: the associated attributes.
        mid_names = [entry.filename for entry in entries
                     if entry.filename.endswith(".mid")]
        if not mid_names:
            raise FileNotFoundError(  # pragma: no cover
                "No mid file in '{0}'".format(filename))
        with zipf.open(mid_names[0], "r") as handle:
            mid_bytes = handle.read()

    return _read_geopandas_from_bytes(mif_bytes, mid_bytes)

191 

192 

def load_carreau_from_zip(file_car=None, file_rect=None):
    """
    Returns an example of gridded (*carroyées*) data.
    The data is available in folder
    `data <https://github.com/sdpython/papierstat/tree/master/
    src/papierstat/datasets/data>`_.
    Notebooks using this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('visualisation', 'carte_carreau')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param file_car zip file with the squares (*carreaux*);
                    defaults to the packaged ``reunion.zip``
    @param file_rect zip file with the data per rectangle;
                     defaults to the packaged ``reunion_rect.zip``
    @return 4 dataframes

    Results:

    * population data per square
    * shapefiles of the squares
    * population data per rectangle
    * shapefiles of the rectangles

    .. note::

        In order to comply with the dissemination rule on household
        fiscal income, no statistical information (except the total
        number of individuals) is published for squares of fewer than
        11 households. These low-count squares are therefore grouped
        into larger rectangles satisfying this minimum-11-households
        rule. `source: INSEE <https://www.insee.fr/fr/statistiques/2520034>`_.
    """
    # Fill each missing file independently so that a caller may
    # override only one of the two archives; the previous behaviour
    # (both None -> packaged defaults) is preserved.
    if file_car is None or file_rect is None:
        data = get_data_folder()
        if file_rect is None:
            file_rect = os.path.join(data, 'reunion_rect.zip')
        if file_car is None:
            file_car = os.path.join(data, 'reunion.zip')
    dfcar = load_dbf_from_zip(file_car)
    shpcar = load_shapes_from_zip(file_car)
    dfrect = load_dbf_from_zip(file_rect)
    shprect = load_shapes_from_zip(file_rect)
    return dfcar, shpcar, dfrect, shprect