Coverage for acspsuedo / source / cache.py: 89%

142 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-11 16:02 +0000

1""" 

2Cache handler for variable/table metadata. 

3""" 

4import typing as t 

5from collections import defaultdict, ChainMap 

6from logging import getLogger 

7from warnings import warn 

8 

9import pandas as pd 

10 

11from acspsuedo.source.low.exceptions import APIException 

12from acspsuedo.source.low.var_fetch import VariableFetchMixin 

13 

14 

# Module-level logger named after this module; used for debug messages
# emitted when metadata is missing from the cache and has to be fetched.
logger = getLogger(__name__)

16 

17 

class MetadataException(APIException):
    """
    Error raised when a requested variable or table is not recognized
    while resolving metadata.
    """

21 

22 

class MetadataWarning(UserWarning):
    """
    Warning category for non-fatal metadata problems, such as a table
    that has no known metadata.
    """

26 

27 

28 

class VariableCache:
    """
    Class for caching metadata on variables and tables in American
    Community Survey datasets.

    Variable and table metadata live in class-level dictionaries keyed by
    dataset, then calendar year, so they are shared by every instance.
    Data frames built by :py:meth:`var_metadata_df` are cached per instance.
    """
    __CACHE_VARIABLES_BY_DATASET_YEAR: t.DefaultDict[
        str, t.DefaultDict[int, dict[str, t.Any]]
    ] = defaultdict(lambda: defaultdict(dict))

    __CACHE_TABLES_BY_DATASET_YEAR: t.DefaultDict[
        str, t.DefaultDict[int, dict[str, dict]]
    ] = defaultdict(lambda: defaultdict(dict))

    def __init__(self, cache_metadata_dfs: bool = True) -> None:
        """
        Initialization for :py:class:`VariableCache`.

        Parameters
        ----------
        cache_metadata_dfs
            Boolean; default True.

            Indicate whether or not to locally cache any created instances
            of the :py:class:`pandas.DataFrame` for querying metadata.
        """
        self._var_fetch = VariableFetchMixin()
        self._cache_metadata_dfs = cache_metadata_dfs

        # Always allocate the data frame cache, even when caching starts out
        # disabled, so that enabling it later through the property setter
        # does not fail on a missing attribute.
        self._CACHE_VAR_DF_BY_DATASET_YEAR: t.DefaultDict[
            str, dict[int, pd.DataFrame]
        ] = defaultdict(dict)

    @property
    def cache_metadata_dfs(self):
        """
        The state of locally caching any created :py:class:`pandas.DataFrame`
        instances containing variable metadata. Default True.
        """
        return self._cache_metadata_dfs

    @cache_metadata_dfs.setter
    def cache_metadata_dfs(self, new_cache_state: bool):
        self._cache_metadata_dfs = new_cache_state

    def get_table(self, dataset: str, year: int, table: str):
        """
        Get metadata information for all variables in a table.

        The class-level cache is consulted first; on a miss the table is
        fetched and cached.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        table
            A supported table within the dataset of interest.

        Returns
        -------
        A dictionary mapping each variable name in the table to its metadata,
        or None when the fetch produced no variables for the table. The
        returned dictionary is the cached object itself; do not mutate it.
        """
        try:
            return VariableCache.__CACHE_TABLES_BY_DATASET_YEAR[dataset][year][table]
        except KeyError:
            pass

        fetched = self._fetch_table(dataset, year, table)
        if fetched is None:
            # `_fetch_table` has already warned that the table has no metadata.
            return None

        logger.debug(
            "Did not find table '%s' for the '%s' dataset during the %s calendar year in cache. "
            "Fetched and cached for potential re-use later", table, dataset, year
        )
        return fetched[table]

    def get_variable(self, dataset: str, year: int, variable: str):
        """
        Get metadata information for a particular variable.

        The class-level cache is consulted first. On a miss, the full
        variable collection is fetched; if the variable is not found that
        way, the variable is requested individually.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        variable
            A supported variable within the dataset of interest.

        Returns
        -------
        The metadata dictionary for the variable.
        """
        try:
            return VariableCache.__CACHE_VARIABLES_BY_DATASET_YEAR[dataset][year][variable]
        except KeyError:
            pass

        try:
            # Load everything and see if it exists
            metadata = self._fetch_variable(dataset, year, None)[variable]
        except Exception:
            # Fall back to here
            # This is usually in the case of users wishing to specify idiosyncratic annotation
            # attribute data, since they aren't in the full collection of variables.
            metadata = self._fetch_variable(dataset, year, variable)[variable]

        logger.debug(
            "Did not find the '%s' variable for the '%s' dataset during the %s calendar year in cache. "
            "Fetched and cached for potential re-use later", variable, dataset, year
        )
        return metadata

    def _fetch_table(self, dataset: str, year: int, table: str):
        """
        Fetch a table from the dataset for the specified calendar year.

        On success, both the table and each of its variables are stored in
        the class-level caches.

        Returns
        -------
        A single-entry dictionary ``{table: variables}``, or None (after
        emitting a :py:class:`MetadataWarning`) when the response holds no
        variables.
        """
        json_content = self._var_fetch._fetch_table_json_content(dataset, year, table)

        all_variables = json_content.get("variables", None)
        if not all_variables:
            warn(
                f"\nThe '{table}' table has no known metadata for the '{dataset}' dataset \n"
                f"during the {year} calendar year.",
                MetadataWarning
            )
            return None

        # We always want to cache
        VariableCache.__CACHE_TABLES_BY_DATASET_YEAR[dataset][year][table] = all_variables
        VariableCache.__CACHE_VARIABLES_BY_DATASET_YEAR[dataset][year].update(all_variables)

        return {table: all_variables}

    def _fetch_all_tables(self, dataset: str, year: int):
        """
        Fetch all tables from the dataset for the specified calendar year.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        Returns
        -------
        A dictionary mapping each table name to a dictionary of its
        variables' metadata.

        Raises
        ------
        APIException
            If the response contains no ``"groups"`` entry.
        """
        json_content = self._var_fetch._fetch_table_json_content(dataset, year)

        # Fail-fast for empty datasets.
        all_grps = json_content.get("groups", None)
        if all_grps is None:
            raise APIException(
                f"Found no tables for the '{dataset}' during the calendar year {year}."
            )

        # A set gives constant-time membership checks in the loop below.
        grp_names = {grp.get("name") for grp in all_grps}

        # Fast approach
        # As opposed to querying each individual table from the API,
        # we get all variables directly and sort by group accordingly.
        content = self._fetch_variable(dataset, year)

        grps_dict = defaultdict(dict)
        for var_name, var_info in content.items():
            grp_name = var_info.get('group')
            if grp_name in grp_names:
                grps_dict[grp_name][var_name] = var_info

        # We always want to cache
        VariableCache.__CACHE_TABLES_BY_DATASET_YEAR[dataset][year].update(grps_dict)

        return grps_dict

    def _fetch_variable(self, dataset: str, year: int, variable: t.Optional[str] = None):
        """
        Fetch a single variable, or all variables, from a dataset.

        Every variable returned is stored in the class-level variable cache.

        Returns
        -------
        A dictionary mapping variable names to their metadata.
        """
        json_content = self._var_fetch._fetch_json_content(dataset, year, variable)

        all_variables = json_content.get("variables", None)

        if all_variables:
            # Drop 'for', 'in', 'ucgid'
            excluded = ("for", "in", "ucgid")
            metadata_dict = {
                name: info for name, info in all_variables.items()
                if name not in excluded
            }
        else:
            # No "variables" collection: the response itself is treated as the
            # metadata of one variable, keyed by its 'name' entry.
            name = json_content.pop('name')
            metadata_dict = {name: json_content}

        # We always cache
        VariableCache.__CACHE_VARIABLES_BY_DATASET_YEAR[dataset][year].update(metadata_dict)

        return metadata_dict

    @staticmethod
    def _metadata_frame(dataset: str, year: int, variables: t.Dict[str, t.Any]) -> pd.DataFrame:
        """
        Build the metadata :py:class:`pandas.DataFrame` with one row per
        variable, ordered by variable name.
        """
        return pd.DataFrame([
            {
                'DATASET': dataset,
                'YEAR': year,
                'VARIABLE': var_name,
                'LABEL': var_info.get('label'),
                'VARIABLE_TYPE': var_info.get('predicateType', 'string'),
                'TABLE': var_info.get('group'),
                # `or ''` also covers a 'concept' key explicitly set to None.
                'TOPIC': (var_info.get('concept') or '').title(),
            }
            for var_name, var_info in sorted(variables.items())
        ])

    def var_metadata_df(self, dataset: str, year: int):
        """
        Return a :py:class:`pandas.DataFrame` containing holistic metadata for all
        variables in a dataset for a given year.

        When data frame caching is enabled, a previously built frame for the
        same dataset and year is returned as-is, and newly built frames are
        stored for re-use.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.
        """
        if self._cache_metadata_dfs:
            try:
                return self._CACHE_VAR_DF_BY_DATASET_YEAR[dataset][year]
            except KeyError:
                pass

        df = self._metadata_frame(dataset, year, self._fetch_variable(dataset, year))

        if self._cache_metadata_dfs:
            self._CACHE_VAR_DF_BY_DATASET_YEAR[dataset][year] = df

        return df

    def tbl_metadata_df(self, dataset: str, year: int, table: str):
        """
        Return a :py:class:`pandas.DataFrame` containing holistic metadata for
        all variables in a dataset table for a given year.

        The 'GEO_ID' and 'NAME' entries are left out of the result.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        table
            The table of interest. Note that the dataset must contain this table.

        Raises
        ------
        APIException
            If the table has no metadata.
        """
        tbl_vars = self.get_table(dataset, year, table)

        if not tbl_vars:
            raise APIException(
                f"The '{table}' table was empty and/or non-existent for the '{dataset}' dataset "
                f"for the {year} calendar year."
            )

        # Filter into a new dict: `tbl_vars` is the shared cache entry, so
        # popping keys from it would corrupt later cache lookups.
        kept = {
            name: info for name, info in tbl_vars.items()
            if name not in ('GEO_ID', 'NAME')
        }
        return self._metadata_frame(dataset, year, kept)

    def _vars_metadata(
        self,
        dataset: str,
        year: int,
        vars: t.Optional[t.Union[t.List[str], str]] = None,
        tbls: t.Optional[t.Union[t.List[str], str]] = None,
        drop_annotation_vars: bool = True
    ) -> t.Tuple[t.List[t.Any], t.Dict[t.Any, t.Any]]:
        """
        Create a list of variables from the supplied variable(s) and table(s), as well as
        a metadata dictionary containing the data types for each variable. The upside of
        this particular approach is to provide a simpler end-consumption for querying
        multiple variables of interest, especially if some share table location, and to
        provide metadata on each variable's typing.

        Parameters
        ----------
        dataset
            A dataset of interest

        year
            A calendar year of interest. Dataset must be supported for this year.

        vars
            One, none, or multiple variables. Default None.

        tbls
            One, none, or multiple tables. Default None.

        drop_annotation_vars
            Boolean; default True. Indicate whether or not to drop supplementary
            attribute/annotation/margin-of-error variables.

        Returns
        -------
        A tuple containing:
            - All variables from the specification provided
            - Each of the variables corresponding data types.

        Raises
        ------
        MetadataException
            If a variable or table is not recognized, or a table has no
            metadata.

        APIException
            If no variables remain after resolution and filtering.
        """
        if not isinstance(vars, list):
            vars = [vars]
        if not isinstance(tbls, list):
            tbls = [tbls]

        vars = [var for var in vars if var is not None]
        tbls = [tbl for tbl in tbls if tbl is not None]

        metadata = []

        for var in vars:
            try:
                var_dict = self.get_variable(dataset, year, var)
            except APIException:
                raise MetadataException(
                    f"The '{var}' variable was not recognized for the '{dataset}' dataset.",
                ) from None
            metadata.append({var: var_dict})

        for tbl in tbls:
            try:
                tbl_dict = self.get_table(dataset, year, tbl)
            except APIException:
                tbl_dict = None
            # `get_table` returns None for a table without any metadata.
            if tbl_dict is None:
                raise MetadataException(
                    f"The '{tbl}' table was not recognized for the '{dataset}' dataset.",
                ) from None
            metadata.append(tbl_dict)

        # ChainMap is kept deliberately: it fixes both the key order and which
        # duplicate wins (earlier entries take precedence).
        metadata = dict(ChainMap(*metadata))

        # Parse out annotation and MOE variables, if indicated
        if drop_annotation_vars:
            markers = ('Annotation', 'Margin of Error')
            metadata = {
                name: info for name, info in metadata.items()
                if not any(marker in (info.get('label') or '') for marker in markers)
            }

        if not metadata:
            raise APIException(
                "Non-existent variables; the specified tables and/or variables do not "
                f"exist for the '{dataset}' dataset during the {year} calendar year."
            )

        meta_dict = {name: info.get('predicateType', '') for name, info in metadata.items()}
        all_vars = list(meta_dict.keys())

        return all_vars, meta_dict

    def _set_dtypes(
        self,
        data_df: pd.DataFrame,
        meta_dict: t.Dict[t.Any, t.Any]
    ):
        """
        Convert the data types for the fetched census data.

        Columns are converted in place on ``data_df``, which is also returned.

        Parameters
        ----------
        data_df
            The fetched data from the Census Bureau.

        meta_dict
            A dictionary specifying dtypes for the fetched data.
        """
        # Type conversion
        for var, var_type in meta_dict.items():
            column = data_df[var]

            if var_type == 'int':
                try:
                    data_df[var] = column.astype(int)
                except (ValueError, TypeError, OverflowError):
                    # Values that cannot be represented as integers (e.g.
                    # missing or fractional values) fall back to float.
                    data_df[var] = column.astype(float)

            elif var_type == 'float':
                data_df[var] = column.astype(float)

            elif var_type == 'string':
                data_df[var] = column.astype(str)

            # Last ditch attempt
            else:
                data_df[var] = column.astype(object)

        return data_df