Coverage for acspsuedo / source / cache.py: 89%
142 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-11 16:02 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-11 16:02 +0000
1"""
2Cache handler for variable/table metadata.
3"""
4import typing as t
5from collections import defaultdict, ChainMap
6from logging import getLogger
7from warnings import warn
9import pandas as pd
11from acspsuedo.source.low.exceptions import APIException
12from acspsuedo.source.low.var_fetch import VariableFetchMixin
# Module-level logger, named after this module.
logger = getLogger(name=__name__)
class MetadataException(APIException):
    """Exception type used to signal metadata issues."""
class MetadataWarning(UserWarning):
    """Warning category used to flag metadata issues."""
class VariableCache:
    """
    Class for caching metadata on variables and tables in American
    Community Survey datasets.

    Variable and table metadata live in class-level caches keyed by dataset
    and then by year, so they are shared by every instance. Metadata data
    frames are cached on the instance, and only while
    :py:attr:`cache_metadata_dfs` is enabled.
    """
    __CACHE_VARIABLES_BY_DATASET_YEAR: t.DefaultDict[
        str, t.DefaultDict[int, dict[str, t.Any]]
    ] = defaultdict(lambda: defaultdict(dict))

    __CACHE_TABLES_BY_DATASET_YEAR: t.DefaultDict[
        str, t.DefaultDict[int, dict[str, dict]]
    ] = defaultdict(lambda: defaultdict(dict))

    def __init__(self, cache_metadata_dfs: bool = True) -> None:
        """
        Initialization for :py:class:`VariableCache`.

        Parameters
        ----------
        cache_metadata_dfs
            Boolean; default True.

            Indicate whether or not to locally cache any created instances
            of the :py:class:`pandas.DataFrame` for querying metadata.
        """
        self._var_fetch = VariableFetchMixin()
        self._cache_metadata_dfs = cache_metadata_dfs

        # Created unconditionally so that enabling caching later through the
        # ``cache_metadata_dfs`` setter always finds a store to write into.
        self._CACHE_VAR_DF_BY_DATASET_YEAR: t.DefaultDict[
            str, dict[int, pd.DataFrame]
        ] = defaultdict(dict)

    @property
    def cache_metadata_dfs(self):
        """
        The state of locally caching any created :py:class:`pandas.DataFrame`
        instances containing variable metadata. Default True.
        """
        return self._cache_metadata_dfs

    @cache_metadata_dfs.setter
    def cache_metadata_dfs(self, new_cache_state: bool):
        self._cache_metadata_dfs = new_cache_state

    def get_table(self, dataset: str, year: int, table: str):
        """
        Get metadata information for all variables in a table.

        The class-level table cache is consulted first; on a miss the table
        is fetched (which also caches it).

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        table
            A supported table within the dataset of interest.

        Returns
        -------
        A dictionary mapping variable names to their metadata. An empty
        dictionary is returned when the fetch yields no metadata for the
        table (a :py:class:`MetadataWarning` is emitted in that case).
        """
        try:
            return VariableCache.__CACHE_TABLES_BY_DATASET_YEAR[dataset][year][table]
        except KeyError:
            pass

        fetched = self._fetch_table(dataset, year, table)
        if not fetched:
            # ``_fetch_table`` warns and returns None for an empty table;
            # hand back an empty mapping instead of subscripting None.
            return {}

        logger.debug(
            "Did not find table '%s' for the '%s' dataset during the %s calendar year in cache. "
            "Fetched and cached for potential re-use later", table, dataset, year
        )
        return fetched[table]

    def get_variable(self, dataset: str, year: int, variable: str):
        """
        Get metadata information for a particular variable.

        The class-level variable cache is consulted first. On a miss, the
        full variable listing is fetched and searched; if that fails, the
        variable is fetched individually.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        variable
            A supported variable within the dataset of interest.

        Returns
        -------
        The metadata stored for the variable.
        """
        try:
            return VariableCache.__CACHE_VARIABLES_BY_DATASET_YEAR[dataset][year][variable]
        except KeyError:
            pass

        try:
            # Load everything and see if it exists
            metadata = self._fetch_variable(dataset, year, None)[variable]
        except Exception:
            # Deliberately broad: any failure of the bulk lookup falls back to
            # a single-variable fetch. This is usually in the case of users
            # wishing to specify idiosyncratic annotation attribute data,
            # since they aren't in the full collection of variables.
            metadata = self._fetch_variable(dataset, year, variable)[variable]

        logger.debug(
            "Did not find the '%s' variable for the '%s' dataset during the %s calendar year in cache. "
            "Fetched and cached for potential re-use later", variable, dataset, year
        )
        return metadata

    def _fetch_table(self, dataset: str, year: int, table: str):
        """
        Fetch a table from the dataset for the specified calendar year.

        On success the table and each of its variables are written to the
        class-level caches.

        Returns
        -------
        ``{table: variables}`` on success, or None (after emitting a
        :py:class:`MetadataWarning`) when the response has no variables.
        """
        json_content = self._var_fetch._fetch_table_json_content(dataset, year, table)

        all_variables = json_content.get("variables", None)
        if not all_variables:
            warn(
                f"\nThe '{table}' table has no known metadata for the '{dataset}' dataset \n"
                f"during the {year} calendar year.",
                MetadataWarning
            )
            return None

        # We always want to cache
        VariableCache.__CACHE_TABLES_BY_DATASET_YEAR[dataset][year][table] = all_variables
        VariableCache.__CACHE_VARIABLES_BY_DATASET_YEAR[dataset][year].update(all_variables)

        return {table: all_variables}

    def _fetch_all_tables(self, dataset: str, year: int):
        """
        Fetch all tables from the dataset for the specified calendar year.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        Returns
        -------
        A mapping of table name to a dictionary of that table's variables.

        Raises
        ------
        APIException
            If the response carries no ``groups`` entry.
        """
        json_content = self._var_fetch._fetch_table_json_content(dataset, year)

        # Fail-fast for empty datasets.
        all_grps = json_content.get("groups", None)
        if all_grps is None:
            raise APIException(
                f"Found no tables for the '{dataset}' during the calendar year {year}."
            )

        # A set keeps the per-variable membership test below constant-time.
        grp_names = {grp.get("name") for grp in all_grps}

        # Fast approach
        # As opposed to querying each individual table from the API,
        # we get all variables directly and sort by group accordingly.
        content = self._fetch_variable(dataset, year)

        grps_dict = defaultdict(dict)
        for var_name, var_info in content.items():
            grp_name = var_info.get('group')
            if grp_name in grp_names:
                grps_dict[grp_name][var_name] = var_info

        # We always want to cache
        VariableCache.__CACHE_TABLES_BY_DATASET_YEAR[dataset][year].update(grps_dict)

        return grps_dict

    def _fetch_variable(self, dataset: str, year: int, variable: t.Optional[str] = None):
        """
        Fetch a single variable, or all variables, from a dataset.

        Every variable returned is written to the class-level variable cache.

        Returns
        -------
        A dictionary mapping variable names to their metadata.
        """
        json_content = self._var_fetch._fetch_json_content(dataset, year, variable)

        all_variables = json_content.get("variables", None)

        if all_variables:
            # Drop 'for', 'in', 'ucgid'
            metadata_dict = {
                k: v for k, v in all_variables.items()
                if k not in ("for", "in", "ucgid")
            }
        else:
            # No "variables" listing: the payload itself is treated as one
            # variable's metadata, keyed by its 'name' entry.
            name = json_content.pop('name')
            metadata_dict = {name: json_content}

        # We always cache
        VariableCache.__CACHE_VARIABLES_BY_DATASET_YEAR[dataset][year].update(metadata_dict)

        return metadata_dict

    @staticmethod
    def _metadata_frame(
        dataset: str,
        year: int,
        variables: t.Mapping[str, t.Any]
    ) -> pd.DataFrame:
        """
        Build the metadata :py:class:`pandas.DataFrame`, one row per variable,
        ordered by variable name.
        """
        return pd.DataFrame([
            {
                'DATASET': dataset,
                'YEAR': year,
                'VARIABLE': var_name,
                'LABEL': var_info.get('label'),
                'VARIABLE_TYPE': var_info.get('predicateType', 'string'),
                'TABLE': var_info.get('group'),
                # ``or ''`` also covers a 'concept' key that is present but None.
                'TOPIC': (var_info.get('concept') or '').title(),
            }
            for var_name, var_info in sorted(variables.items(), key=lambda kv: kv[0])
        ])

    def var_metadata_df(self, dataset: str, year: int):
        """
        Return a :py:class:`pandas.DataFrame` containing holistic metadata for all
        variables in a dataset for a given year.

        When :py:attr:`cache_metadata_dfs` is enabled, a previously built
        frame for the same dataset and year is returned as-is, and newly
        built frames are stored for re-use.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.
        """
        if self._cache_metadata_dfs:
            cached_df = self._CACHE_VAR_DF_BY_DATASET_YEAR[dataset].get(year)
            if cached_df is not None:
                return cached_df

        df = self._metadata_frame(dataset, year, self._fetch_variable(dataset, year))

        if self._cache_metadata_dfs:
            self._CACHE_VAR_DF_BY_DATASET_YEAR[dataset][year] = df

        return df

    def tbl_metadata_df(self, dataset: str, year: int, table: str):
        """
        Return a :py:class:`pandas.DataFrame` containing holistic metadata for
        all variables in a dataset table for a given year.

        The ``GEO_ID`` and ``NAME`` entries are left out of the frame.

        Parameters
        ----------
        dataset
            The dataset of interest.

        year
            A calendar year of interest. Note that the dataset must be available
            for this year.

        table
            The table of interest. Note that the dataset must contain this table.

        Raises
        ------
        APIException
            If no metadata is available for the table.
        """
        tbl_vars = self.get_table(dataset, year, table)

        if tbl_vars:
            # Filter into a new dict: ``tbl_vars`` is the cached table entry,
            # so popping keys from it would corrupt the shared cache.
            kept_vars = {
                name: info for name, info in tbl_vars.items()
                if name not in ('GEO_ID', 'NAME')
            }
            return self._metadata_frame(dataset, year, kept_vars)

        raise APIException(
            f"The '{table}' table was empty and/or non-existent for the '{dataset}' dataset "
            f"for the {year} calendar year."
        )

    def _vars_metadata(
        self,
        dataset: str,
        year: int,
        vars: t.Optional[t.Union[t.List[str], str]] = None,
        tbls: t.Optional[t.Union[t.List[str], str]] = None,
        drop_annotation_vars: bool = True
    ) -> t.Tuple[t.List[t.Any], t.Dict[t.Any, t.Any]]:
        """
        Create a list of variables from the supplied variable(s) and table(s), as well as
        a metadata dictionary containing the data types for each variable. The upside of
        this particular approach is to provide a simpler end-consumption for querying
        multiple variables of interest, especially if some share table location, and to
        provide metadata on each variable's typing.

        Parameters
        ----------
        dataset
            A dataset of interest

        year
            A calendar year of interest. Dataset must be supported for this year.

        vars
            One, none, or multiple variables. Default None.

        tbls
            One, none, or multiple tables. Default None.

        drop_annotation_vars
            Boolean; default True. Indicate whether or not to drop supplementary
            attribute/annotation/margin-of-error variables.

        Returns
        -------
        A tuple containing:
            - All variables from the specification provided
            - Each of the variables corresponding data types.

        Raises
        ------
        MetadataException
            If looking up a variable or table raises an ``APIException``.

        APIException
            If no variables remain after resolution and filtering.
        """
        # Normalise scalars (including None) to lists, then discard the Nones.
        if not isinstance(vars, list):
            vars = [vars]
        if not isinstance(tbls, list):
            tbls = [tbls]

        vars = [var for var in vars if var is not None]
        tbls = [tbl for tbl in tbls if tbl is not None]

        metadata = []

        for var in vars:
            try:
                var_dict = self.get_variable(dataset, year, var)
            except APIException:
                raise MetadataException(
                    f"The '{var}' variable was not recognized for the '{dataset}' dataset.",
                ) from None
            metadata.append({var: var_dict})

        for tbl in tbls:
            try:
                tbl_dict = self.get_table(dataset, year, tbl)
            except APIException:
                raise MetadataException(
                    f"The '{tbl}' table was not recognized for the '{dataset}' dataset.",
                ) from None
            metadata.append(tbl_dict)

        # Parse out annotation and MOE variables, if indicated
        metadata = dict(ChainMap(*metadata))
        if drop_annotation_vars:
            dropped_markers = ('Annotation', 'Margin of Error')
            metadata = {
                k: v for k, v in metadata.items()
                # ``or ''`` also covers a 'label' key that is present but None.
                if not any(marker in (v.get('label') or '') for marker in dropped_markers)
            }

        if not metadata:
            raise APIException(
                "Non-existent variables; the specified tables and/or variables do not "
                f"exist for the '{dataset}' dataset during the {year} calendar year."
            )

        meta_dict = {k: v.get('predicateType', '') for k, v in metadata.items()}
        all_vars = list(meta_dict)

        return all_vars, meta_dict

    def _set_dtypes(
        self,
        data_df: pd.DataFrame,
        meta_dict: t.Dict[t.Any, t.Any]
    ):
        """
        Convert the data types for the fetched census data.

        Columns are converted in place on ``data_df``, which is also returned.

        Parameters
        ----------
        data_df
            The fetched data from the Census Bureau.

        meta_dict
            A dictionary specifiying dtypes for the fetched data.
        """
        # Type conversion
        for var, var_type in meta_dict.items():

            if var_type == 'int':
                try:
                    data_df[var] = data_df[var].astype(int)
                except (ValueError, TypeError, OverflowError):
                    # Values that cannot be cast to int fall back to float.
                    data_df[var] = data_df[var].astype(float)

            elif var_type == 'float':
                data_df[var] = data_df[var].astype(float)

            elif var_type == 'string':
                data_df[var] = data_df[var].astype(str)

            # Last ditch attempt
            else:
                data_df[var] = data_df[var].astype(object)

        return data_df