Coverage for acspsuedo / query.py: 98%
169 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-11 16:02 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-11 16:02 +0000
1"""
2Main-level entry point for downloading Census Bureau data.
4Note that it is recommended to obtain an API key if you are
5making many (500+) queries in a daily session. An API key
6is free to obtain at: https://api.census.gov/data/key_signup.html.
7"""
9import typing as t
10import warnings
12import aiohttp
13import pandas as pd
14import geopandas as gpd
15import numpy as np
17from acspsuedo.fips import STATE_FIPS
18from acspsuedo.source.geog import GeoSpecFmtter, ApiKeyConfig
19from acspsuedo.source.shpfile import ShpfileFormatterException
20from acspsuedo.source.shpfile_fmt import GEO_SPEC_METADATA
21from acspsuedo.source.cache import VariableCache
22from acspsuedo.source.na_values import REPLACEMENT_VALUES
23from acspsuedo.source.low.protocols import fetch_table, batch_fetch_content
24import acspsuedo.source.shpfile
# Shared API-key settings; `_fmt_url` asks this instance for the key fragment
# when it builds request URLs.
api_key_config: ApiKeyConfig = ApiKeyConfig()
"""
Configuration settings for the API key.

Note that you can specify the location of your API key through one of three ways:
- By assigning it to the `API_KEY` attribute (prioritized)
- By setting it in the operating system environment
- By writing it in a textfile (`./api_key.txt`) in the working directory.

You can customize the locations of the last two settings with the respective attributes:
- `api_key_config.OS_ENV_LOCATION`, for the operating system location
- `api_key_config.FILE_PATH`, for the file path location
"""

# Shared metadata cache; `_fmt_download_url` uses it to resolve variables/tables and
# `_df_cleaner` uses it to set column dtypes.
variable_cache: VariableCache = VariableCache()
"""
Internal source for caching metadata information regarding tables and variables across
all American Community Survey datasets.

This is exposed here in case you wish to customize caching preferences and/or view any
information regarding variables/tables via the methods of this instance.
"""

# Re-export of the handler object defined in `acspsuedo.source.shpfile` (same object,
# not a copy), so settings changed here apply to the handler used by `_get_shpfile`.
shapefile_handler: acspsuedo.source.shpfile.ShpFileHandler = acspsuedo.source.shpfile.shapefile_handler
"""
Internal handler interface for TIGER shapefiles.

This is exposed here in the scenario that you may want to customize caching preferences.
There are three such caching preferences that you may customize:

- `shapefile_handler.auto_cache` (`bool`; default True)

    Indicate whether or not to automatically cache extracted shapefile
    information.

- `shapefile_handler.cache_path` (:py:class:`pathlib.Path` or `string`;
    default `Path.home() / 'cache' / 'acspsuedo' / 'TIGER_shapefiles'`)

    If `auto_cache` is True, `cache_path` specifies the caching location
    of extracted shapefiles.

- `shapefile_handler.track_updated_cache` (`bool`; default True)

    Indicate whether or not tracked shapefiles posited in the previous
    cache location should be moved if/when a new cache location should
    be specified. For the justification of this part of the handler
    interface, please check out the `acspsuedo.source.shpfile` module.
"""

# Two helper functions, for viewing geographic scopes metadata
# (module-level aliases of the corresponding `GeoSpecFmtter` attributes).
check_path_existence = GeoSpecFmtter.check_path_existence
view_geographic_paths = GeoSpecFmtter.view_geographic_paths
def download(
    dataset: str,
    year: int,
    *,
    variables: t.Optional[t.Union[t.List[str], str]] = None,
    tables: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_variables: bool = True,
    convert_to_na: bool = True,
    include_geometries: bool = False,
    with_geometry_id_columns: bool = False,
    **geographic_specifiers
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Synchronously download American Community Survey (ACS) data from the United States
    Census Bureau for some geographies of interest.

    Particular variables, whole tables, or a combination of the two may be requested.

    Parameters
    ----------
    dataset
        A supported ACS dataset.

        The supported datasets, and the years available for each, are listed in
        `acspsuedo.datasets`.

    year
        A calendar year for the ACS dataset. The year must be available for the
        requested dataset.

    variables
        A variable, or list of variables, to be queried from the ACS dataset.

    tables
        A dataset table, or list of tables, which must be supported by the ACS dataset
        of interest.

    drop_annotation_variables
        The Bureau often attaches supplementary, non-required attribute and
        margin-of-error information for estimate data. Indicate whether or not to drop
        this information. Default `True`.

    convert_to_na
        Indicate whether or not special values should be replaced with `np.nan` values.
        Default `True`.

    include_geometries
        Indicate whether or not to incorporate geometric information from the Census
        Bureau's TIGER Shapefile database. Useful for geographical analysis and/or map
        visualization. Default `False`.

        Note: Because TIGER shapefile naming conventions changed over the years, and
        because some scopes have no corresponding geometric information, geometries
        cannot always be attached. In that case a plain :py:class:`pandas.DataFrame`
        with the queried data is returned instead of the anticipated
        :py:class:`geopandas.GeoDataFrame`.

    with_geometry_id_columns
        If `include_geometries` is True, indicate whether or not to also keep the
        identifier columns that come with the geometric information. Default `False`.

        Note: These columns cohere with the identifier columns already returned by the
        data query and are therefore considered redundant; set this to `True` only if
        you want them anyway.

    geographic_specifiers
        A set of geographic specifiers specifying the geographies to which the queried
        data should be restricted.

        To view the fully-specified geographic paths available for an ACS dataset, see
        the `~view_geographic_paths()` function. If you already know your geographic
        specifier(s), the `~check_path_existence()` function reports whether they are
        supported for a dataset and, if so, every geographic path containing them.

    Returns
    -------
    A :py:class:`pandas.DataFrame` containing the queried American Community Survey data.

    If `include_geometries` is `True`, and TIGER shapefile data exists for the queried
    data, the return is a :py:class:`geopandas.GeoDataFrame` that additionally carries
    the geometric shapefile information.

    Notes
    -----
    For multiple queries (500+) in a session, it is recommended to obtain an API key.
    API keys are free to obtain at https://api.census.gov/data/key_signup.html. If you
    wish to specify an API key, set the key in your operating system's environment, e.g.,
    ```
    import os
    os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
    ```
    or write it to a textfile in the working directory (`./api_key.txt`). If both are
    supplied, the OS environment key is prioritized.

    The configuration for the locations of these settings can be customized.
    ```
    from acspsuedo.query import api_key_config
    api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
    api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
    ```
    """
    # Resolve the request URLs, the dtype metadata, and the normalized specifiers.
    request_urls, dtype_metadata, resolved_specifiers = _fmt_download_url(
        dataset=dataset,
        year=year,
        vars=variables,
        tbls=tables,
        drop_annotation_vars=drop_annotation_variables,
        **geographic_specifiers
    )

    # Fetch, then tidy the raw table.
    cleaned = _df_cleaner(
        fetch_table(request_urls),
        year,
        dtype_metadata,
        convert_to_na,
        **resolved_specifiers
    )

    if not include_geometries:
        return cleaned

    return append_geographic_info(
        cleaned, year, with_geometry_id_columns, **resolved_specifiers
    )
async def async_download(
    session: aiohttp.ClientSession,
    dataset: str,
    year: int,
    *,
    variables: t.Optional[t.Union[t.List[str], str]] = None,
    tables: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_variables: bool = True,
    convert_to_na: bool = True,
    retry_rate: int = 30,
    timeout_rate: t.Union[float, int] = 0.1,
    include_geometries: bool = False,
    with_geometry_id_columns: bool = False,
    **geographic_specifiers
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Asynchronous counterpart of `download()` for American Community Survey (ACS) data
    from the United States Census Bureau. The request URLs are fetched through
    `batch_fetch_content` on the supplied session (fetches are I/O-bound tasks).

    Particular variables, whole tables, or a combination of the two may be requested.

    Parameters
    ----------
    session
        A(n) :py:class:`aiohttp.ClientSession` interface/context manager.

    dataset
        A supported ACS dataset.

        The supported datasets, and the years available for each, are listed in
        `acspsuedo.datasets`.

    year
        A calendar year for the ACS dataset. The year must be available for the
        requested dataset.

    variables
        A variable, or list of variables, to be queried from the ACS dataset.

    tables
        A dataset table, or list of tables, which must be supported by the ACS dataset
        of interest.

    drop_annotation_variables
        The Bureau often attaches supplementary, non-required attribute and
        margin-of-error information for estimate data. Indicate whether or not to drop
        this information. Default `True`.

    convert_to_na
        Indicate whether or not special values should be replaced with `np.nan` values.
        Default `True`.

    retry_rate
        In case of server-based blocking, indicate how many attempts should be made per
        URL before skipping. Default 30.

    timeout_rate
        In case of querying large amounts of tables/variables, by how much (in seconds)
        each request attempt should be delayed. Default 0.1 seconds.

    include_geometries
        Indicate whether or not to incorporate geometric information from the Census
        Bureau's TIGER Shapefile database. Useful for geographical analysis and/or map
        visualization. Default `False`.

        Note: Because TIGER shapefile naming conventions changed over the years, and
        because some scopes have no corresponding geometric information, geometries
        cannot always be attached. In that case a plain :py:class:`pandas.DataFrame`
        with the queried data is returned instead of the anticipated
        :py:class:`geopandas.GeoDataFrame`.

    with_geometry_id_columns
        If `include_geometries` is True, indicate whether or not to also keep the
        identifier columns that come with the geometric information. Default `False`.

        Note: These columns cohere with the identifier columns already returned by the
        data query and are therefore considered redundant; set this to `True` only if
        you want them anyway.

    geographic_specifiers
        A set of geographic specifiers specifying the geographies to which the queried
        data should be restricted.

        To view the fully-specified geographic paths available for an ACS dataset, see
        the `~view_geographic_paths()` function. If you already know your geographic
        specifier(s), the `~check_path_existence()` function reports whether they are
        supported for the dataset and, if so, every geographic path containing them.

    Returns
    -------
    A :py:class:`pandas.DataFrame` containing the queried American Community Survey data.

    If `include_geometries` is `True`, and TIGER shapefile data exists for the queried
    data, the return is a :py:class:`geopandas.GeoDataFrame` that additionally carries
    the geometric shapefile information.

    Notes
    -----
    For multiple queries (500+) in a session, it is recommended to obtain an API key.
    API keys are free to obtain at https://api.census.gov/data/key_signup.html. If you
    wish to specify an API key, set the key in your operating system's environment, e.g.,
    ```
    import os
    os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
    ```
    or write it to a textfile in the working directory (`./api_key.txt`). If both are
    supplied, the OS environment key is prioritized.

    The configuration for the locations of these settings can be customized.
    ```
    from acspsuedo.query import api_key_config
    api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
    api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
    ```
    """
    # URL/metadata formatting is a synchronous call that runs exactly once, before the
    # first `await`. The variable cache it relies on is therefore consulted before any
    # fetch is in flight rather than while requests are being awaited, which is why the
    # concurrent fetching below should raise no concern about cache consistency.
    request_urls, dtype_metadata, resolved_specifiers = _fmt_download_url(
        dataset=dataset,
        year=year,
        vars=variables,
        tbls=tables,
        drop_annotation_vars=drop_annotation_variables,
        **geographic_specifiers
    )

    raw_table = await batch_fetch_content(session, request_urls, retry_rate, timeout_rate)
    cleaned = _df_cleaner(
        raw_table, year, dtype_metadata, convert_to_na, **resolved_specifiers
    )

    if not include_geometries:
        return cleaned

    return append_geographic_info(
        cleaned, year, with_geometry_id_columns, **resolved_specifiers
    )
def _df_cleaner(
    df: pd.DataFrame,
    year: int,
    meta_dict: t.Dict[t.Any, t.Any],
    drop_na: bool = True,
    **geo_specifiers
) -> pd.DataFrame:
    """
    Internal routine that tidies a freshly queried Census Bureau table.

    The steps are: add a `YEAR` column, upper-case the column labels, put the
    identifier columns first (remaining columns sorted by label), drop duplicated
    columns, sort the rows, apply the dtypes from `meta_dict`, and optionally replace
    the special values with `np.nan`.

    Note that the `YEAR` column and the upper-cased labels are written onto the
    `df` object that was passed in.

    Parameters
    ----------
    df
        The raw queried table.

    year
        The calendar year written into the `YEAR` column.

    meta_dict
        A dictionary containing data types for each of the queried variables.

    drop_na
        Indicate whether or not special values should be replaced with `np.nan` values.
        Default `True`.

    geo_specifiers
        A set of geographic specifiers specifying the geographies on which queried data
        should be restricted to.

    Returns
    -------
    The cleaned :py:class:`pandas.DataFrame`.
    """
    # Year column first, then normalize every label to upper case
    df['YEAR'] = year
    df.columns = [label.upper() for label in df.columns]

    # Identifier columns lead; everything else follows in sorted order
    geo_labels = GeoSpecFmtter.get_geo_cols(**geo_specifiers)
    present = list(df.columns)
    candidate_ids = ['NAME', 'GEO_ID', 'UCGID', *geo_labels, 'YEAR']
    id_cols = [label for label in candidate_ids if label in present]
    data_cols = sorted(label for label in present if label not in id_cols)
    df = df[id_cols + data_cols]

    # Keep only the first occurrence of each column label
    keep_mask = ~df.columns.duplicated()
    df = df.iloc[:, keep_mask].copy()

    # Rows are ordered by GEO_ID when available, otherwise by the id columns (sans NAME)
    if 'GEO_ID' in df.columns:
        # Retain only the part of the identifier following the first 'US'
        df['GEO_ID'] = [geo_id.split('US', 1)[-1] for geo_id in df['GEO_ID']]
        sort_keys = 'GEO_ID'
    else:
        if 'NAME' in id_cols:
            id_cols.remove('NAME')
        sort_keys = id_cols
    df = df.sort_values(by=sort_keys, ignore_index=True)

    # Column dtypes come from the cached variable metadata
    df = variable_cache._set_dtypes(df, meta_dict)

    # Swap the special values for NaN (if specified; default 'True')
    if drop_na:
        df.replace(REPLACEMENT_VALUES, np.nan, inplace=True)

    return df
def _fmt_download_url(
    dataset: str,
    year: int,
    vars: t.Optional[t.Union[t.List[str], str]] = None,
    tbls: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_vars: bool = True,
    **geog_specifiers
) -> t.Tuple[list[str], t.Dict[str, str], t.Dict[str, str]]:
    """
    Internal for formatting multiple download links to the Census Bureau
    (in the potential case that a user may query 50+ variables at once).

    The resolved variable list is split into consecutive batches of at most 50
    variables and one URL is produced per batch.

    Parameters
    ----------
    dataset
        The ACS dataset being queried.

    year
        The calendar year being queried.

    vars
        A variable, or list of variables, requested by the caller (may be `None`).

    tbls
        A table, or list of tables, requested by the caller (may be `None`).

    drop_annotation_vars
        Forwarded to the variable cache when resolving the variables to query.

    geog_specifiers
        The geographic specifiers of the query.

    Returns
    -------
    A 3-tuple of: the list of request URLs, the metadata dictionary returned by the
    variable cache, and the geographic specifiers as returned by `_fmt_url`.
    """
    batch_size = 50

    url, geog_specifiers = _fmt_url(dataset, year, **geog_specifiers)
    vars, meta_dict = variable_cache._vars_metadata(dataset, year, vars, tbls, drop_annotation_vars)

    # Stop the range at len(vars), not len(vars) + 1: the latter yields one extra batch
    # that is empty (i.e. a URL with a blank `get=` list) whenever the number of
    # variables is an exact multiple of the batch size. `max(..., 1)` retains the
    # previous behaviour of still emitting a single URL when no variables are resolved.
    urls = [
        url.format(','.join(vars[start:start + batch_size]))
        for start in range(0, max(len(vars), 1), batch_size)
    ]

    return urls, meta_dict, geog_specifiers
def _fmt_url(dataset: str, year: int, **geog_specifiers):
    """
    Build the URL skeleton for a query.

    Returns a 2-tuple of the URL (with a single `{}` placeholder left in the `get=`
    slot for the variable list to be filled in later) and the geographic specifiers
    as returned by `GeoSpecFmtter.get_fmt_path`.
    """
    geo_specs, geog_specifiers = GeoSpecFmtter.get_fmt_path(dataset, year, **geog_specifiers)
    api_key = api_key_config._get_api_key()

    # The doubled braces leave a literal '{}' behind for the variable list.
    skeleton = f'https://api.census.gov/data/{year}/{dataset}?get={{}}{geo_specs}{api_key}'

    return skeleton, geog_specifiers
def append_geographic_info(
    data_df: pd.DataFrame,
    year: int,
    with_geometry_columns: bool = False,
    **geographic_specifiers: t.Any
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Attach TIGER shapefile geometries to a queried dataset.

    Parameters
    ----------
    data_df
        The :py:class:`pandas.DataFrame` instance generated from the query.

    year
        The calendar year for the queried data.

    with_geometry_columns
        Indicate whether or not to keep the additional (non-geometry) columns that the
        shapefile contributes to the merge. When `False` (the default) only the queried
        data columns and the `geometry` column are returned.

    geographic_specifiers
        The set of geographic specifiers that were used for the query. The last
        specifier determines which merge columns are looked up.

    Returns
    -------
    A :py:class:`geopandas.GeoDataFrame` instance containing geographic information
    for each record/row from the fetched Census Bureau data *provided a shapefile
    is found*. Otherwise, `data_df` itself is returned unchanged.
    """
    shapes = _get_shpfile(data_df, year, **geographic_specifiers)

    # An empty frame suggests shapefile non-existence and/or naming convention issues;
    # hand back the plain data in that case.
    if shapes.empty:
        return data_df

    # The merge keys on each side are looked up by the last (innermost) specifier.
    scope = [*geographic_specifiers][-1]
    _, data_keys, _, shape_keys, _ = GEO_SPEC_METADATA[scope]

    # Also match on GEO_ID whenever both sides carry it.
    if 'GEO_ID' in data_df.columns and 'GEO_ID' in shapes.columns:
        data_keys = ['GEO_ID', *data_keys]
        shape_keys = ['GEO_ID', *shape_keys]

    # Right join: every queried row is kept, whether or not a geometry matched.
    merged = pd.merge(
        shapes,
        data_df,
        how="right",
        left_on=[*shape_keys, 'YEAR'],
        right_on=[*data_keys, 'YEAR'],
    )

    # Queried data columns go first, the geometry last, shapefile extras in between.
    queried_cols = list(data_df.columns)
    excluded = [*queried_cols, 'geometry']
    extra_cols = []
    if with_geometry_columns:
        extra_cols = [label for label in merged.columns if label not in excluded]

    return merged[queried_cols + extra_cols + ['geometry']]
def _get_shpfile(
    df: pd.DataFrame,
    year: int,
    **geographic_specifiers: t.Any
) -> gpd.GeoDataFrame:
    """
    Underlying routine that makes runs to the shapefile database (if files are not
    previously cached) with our shapefile handler and formats any shapefiles requiring
    a 'state' outer point of reference.

    Parameters
    ----------
    df
        The queried data; its `STATE` column (when present) determines which states'
        shapefiles are fetched in the per-state fallback.

    year
        The calendar year of the shapefile.

    geographic_specifiers
        The geographic specifiers of the query. In the per-state fallback the last
        specifier is used as the shapefile scope.

    Returns
    -------
    A :py:class:`geopandas.GeoDataFrame` with the located geometries. An *empty*
    GeoDataFrame is returned when no shapefile could be located, so callers can test
    `.empty` in every failure case.
    """
    try:
        gdf = shapefile_handler.fetch_tiger_shpfile(year, **geographic_specifiers)
    # Specific handling for shapefiles whose outer point of reference is 'state'
    except ShpfileFormatterException:
        # If 'STATE' is found, great. Otherwise, we load all 'state' FIPS codes and
        # extract the shapefiles from each. The latter typically arises when users
        # run data queries for certain geographic pathways of length 1 and/or containing
        # wildcard operators (e.g. {'congressional_district': '*'}).
        if 'STATE' in df.columns:
            states = list(df['STATE'].unique())
        else:
            states = list(STATE_FIPS.values())

        # Shapefile scope governed by last specifier
        shpfile_scope = list(geographic_specifiers)[-1]

        gdfs = []
        # Per-state misses are expected here, so the handler's warnings are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for state in states:
                state_gdf = shapefile_handler.fetch_tiger_shpfile(
                    year, **{'state': state, shpfile_scope: '*'}
                )
                if state_gdf is not None:
                    gdfs.append(state_gdf)

        if not gdfs:
            # `pd.concat` raises ValueError on an empty list. Mirror the `None` case
            # below instead, and warn explicitly since the per-state warnings were
            # suppressed above.
            warnings.warn(
                f"No TIGER shapefiles could be located for the '{shpfile_scope}' scope "
                f"in any of the referenced states for the {year} calendar year.",
                UserWarning
            )
            return gpd.GeoDataFrame()

        return pd.concat(gdfs, ignore_index=True)

    if gdf is None:
        # Indicates our anticipated naming convention issues or shapefile non-existence
        return gpd.GeoDataFrame()
    return gdf
def confined_download(area_threshold: t.Union[int, float] = 0.7, **geographic_specifiers):
    """
    Prepare a download of American Community Survey (ACS) data whose geographies are
    confined to the outer-layer of geographies specified here.

    Parameters
    ----------
    area_threshold
        What percentage of the inner-layer set of geographies' areas must be within the
        outer-layer geography area. Must lie between 0 and 1. Default 0.7.

    geographic_specifiers
        A set of geographic specifiers specifying the geographies within which the
        queried data, for the inner-layer set of geographies, should be confined.

        To view the fully-specified geographic paths available for an ACS dataset, see
        the `~view_geographic_paths()` function. If you already know your geographic
        specifier(s), the `~check_path_existence()` function reports whether they are
        supported for a dataset and, if so, every geographic path containing them.

    Returns
    -------
    A :py:class:`acspsuedo.query._ConfinedDownload` instance.

    *Note*: This class supports a (synchronous) download method, whose parameter space
    is the same as that of the normal `acspsuedo.query.download()` function.

    Raises
    ------
    ValueError
        If `area_threshold` is below 0 or above 1.
    """
    out_of_bounds = area_threshold < 0 or area_threshold > 1
    if out_of_bounds:
        raise ValueError("Valid area threshold values must be between 0 and 1.")

    handler = _ConfinedDownload(area_threshold, **geographic_specifiers)
    return handler
class _ConfinedDownload:
    """
    Handler for downloading ACS data at geographic specifiers that are
    confined to a different scope than readily permissible.

    The outer-layer geometry is queried lazily on the first `download()` call and is
    reused by later calls made with the same dataset, year, and outer-layer
    geographic specifiers.
    """
    def __init__(self, area_threshold: t.Union[int, float], **geographic_specifiers) -> None:
        self._area_threshold = area_threshold
        self._geographic_specifiers = geographic_specifiers

        # Internals for:
        # - Checking if a query attempt has been made,
        # - Which (dataset, year) that attempt was made for, and
        # - The outer-layer geography (if an attempt is successful)
        self._query_attempt = False
        self._query_key: t.Optional[t.Tuple[str, int]] = None
        self._outer_geography = None

    @property
    def area_threshold(self):
        """
        The percentage of the inner-layer of geographies' areas that must be within the
        outer-layer geography area.
        """
        return self._area_threshold

    @area_threshold.setter
    def area_threshold(self, new_threshold: t.Union[int, float]):
        if (0 > new_threshold) or (new_threshold > 1):
            raise ValueError("Valid area threshold values must be between 0 and 1.")
        self._area_threshold = new_threshold

    @property
    def geographic_specifiers(self):
        """
        The geographic specifiers indicating the outer-layer geography area to which
        inner-level geographies from queried data will be confined to.
        """
        return self._geographic_specifiers

    @geographic_specifiers.setter
    def geographic_specifiers(self, new_specififers: t.Dict[t.Any, t.Any]):
        # Reset query attempt state, so the outer-layer geography is queried anew.
        if self._geographic_specifiers != new_specififers:
            self._query_attempt = False
        self._geographic_specifiers = new_specififers

    def __repr__(self) -> str:
        specifiers = ', '.join(f"{k} = {v}" for k, v in self._geographic_specifiers.items())
        return (
            f"_ConfinedDownload(area_threshold = {self._area_threshold}, "
            f"geographic_specifiers = {{{specifiers}}})"
        )

    def __eq__(self, other) -> bool:
        if isinstance(other, _ConfinedDownload):
            return (self._area_threshold == other._area_threshold) and \
                   (self._geographic_specifiers == other.geographic_specifiers)
        return False

    def download(
        self,
        dataset: str,
        year: int,
        *,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
        drop_annotation_variables: bool = True,
        convert_to_na: bool = True,
        include_geometries: bool = False,
        with_geometry_id_columns: bool = False,
        **geographic_specifiers
    ) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download data from the United States Census Bureau's American Community Survey's
        (ACS) for geographies of interest that are confined to the outer-layer of
        geographies held by this instance.

        Note that you can specify particular variables, tables, or some combination of
        the two.

        Parameters
        ----------
        dataset
            A supported ACS dataset.

            To view the list of supported datasets, as well as their respectively
            available years, see `acspsuedo.datasets`.

        year
            A calendar year for the ACS dataset.

            Note that this calendar year must be available for the specified ACS dataset
            of interest.

        variables
            A variable, or list of variables, to be queried from the ACS dataset.

        tables
            A dataset table, or list of tables, which must be supported by the ACS
            dataset of interest.

        drop_annotation_variables
            The Bureau often attaches supplementary, non-required attribute and
            margin-of-error information for estimate data. Indicate whether or not to
            drop this information. Default `True`.

        convert_to_na
            Indicate whether or not special values should be replaced with `np.nan`
            values. Default `True`.

        include_geometries
            Indicate whether or not to keep the geometric information from the Census
            Bureau's TIGER Shapefile database in the result. Useful for geographical
            analysis and/or map visualization. Default `False`.

            Note: Geometries are always fetched internally, since they are needed for
            the confinement itself. Due to the changes in naming conventions for TIGER
            shapefiles over the years or the non-existence of corresponding geometric
            information for certain scopes, geometries may not be available. In which
            case, the unconfined queried data is returned as a(n)
            :py:class:`pandas.DataFrame` instance.

        with_geometry_id_columns
            If `include_geometries` is True, indicate whether or not to append the
            geometric information with their respective identifier columns.
            Default `False`.

            Note: These columns have been made to cohere with those identifier
            variables/columns requested from the data query and thus are deemed
            redundant. Nevertheless, if you wish to specify this additional identifier
            information, you can set this setting to `True`.

        geographic_specifiers
            The set of inner-layer geographic specifiers indicating the geographies to
            which queried data references. Queried data at this inner-layer, in turn,
            will be confined to the outer-layer geography.

            To view available fully-specified geographic paths for an ACS dataset,
            reference the `~view_geographic_paths()` function. If you know your
            geographic specifier(s), reference the `~check_path_existence()` function to
            see whether or not they are supported for an ACS dataset of interest and, if
            they are supported, all geographic paths containing those specifiers of
            interest.

        Returns
        -------
        A :py:class:`pandas.DataFrame` containing the queried American Community Survey
        data of interest.

        If `include_geometries` is `True`, and TIGER shapefile data exists for the
        queried data, the return is a :py:class:`geopandas.GeoDataFrame` containing
        geometric shapefile information.

        Notes
        -----
        An empty :py:class:`pandas.DataFrame` (or :py:class:`geopandas.GeoDataFrame`, if
        `include_geometries` is `True`) may be returned. This corresponds to a scenario
        in which no inner-layer geography intersects the outer-layer geography, or none
        of the intersecting ones meets the area threshold.

        If the outer-layer shapefile cannot be located, a `UserWarning` is raised and
        the unconfined inner-layer data is returned.

        For multiple queries (500+) in a session, it is recommended to obtain an API
        key. API keys are free to obtain at
        https://api.census.gov/data/key_signup.html. If you wish to specify an API key,
        set the key in your operating system's environment, e.g.,
        ```
        import os
        os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
        ```
        or write it to a textfile in the working directory (`./api_key.txt`). If both
        are supplied, the OS environment key is prioritized.

        The configuration for the locations of these settings can be customized.
        ```
        from acspsuedo.query import api_key_config
        api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
        api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
        ```
        """
        # Geometries are always requested here: they are required for the confinement
        # regardless of whether the caller wants them in the final result.
        inner_data = download(
            dataset = dataset,
            year = year,
            variables = variables,
            tables = tables,
            drop_annotation_variables = drop_annotation_variables,
            convert_to_na = convert_to_na,
            with_geometry_id_columns = with_geometry_id_columns,
            include_geometries = True,
            **geographic_specifiers
        )
        # To accommodate for cases when a TIGER shapefile cannot be found (thanks
        # to our earlier configurations, a warning is automatically raised)
        if not isinstance(inner_data, gpd.GeoDataFrame):
            return inner_data

        inner_crs = inner_data.crs  # <- Keep the original Coordinate Referencing System

        outer_data = self._get_outer_download(dataset, year, variables, tables)

        if outer_data is None:
            msg = (
                f"\nCould not locate the appropriate TIGER shapefile for the outer-layer set "
                f"of geographies given by {self._geographic_specifiers} for the {year} calendar\n"
                f"year.\n"
                "\nAs a result, the returned set of data corresponds to data downloaded solely from "
                "the reference of the inner-layer set of geographic specifiers."
            )
            warnings.warn(msg, UserWarning)
            return inner_data

        # Necessary to avoid modifying the referenced (cached) object
        outer_data = outer_data.copy()

        return self.__confined_data_fmtter(inner_data, outer_data, inner_crs, include_geometries)

    def __confined_data_fmtter(
        self,
        inner_data: gpd.GeoDataFrame,
        outer_data: gpd.GeoDataFrame,
        inner_data_crs: t.Any,
        include_geometries: bool
    ) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Confine `inner_data` to the geometries in `outer_data`, keeping only the rows
        whose overlap with the outer geometry is at least `area_threshold` times their
        own area. The result is returned in `inner_data_crs`; its geometry column is
        dropped when `include_geometries` is False.
        """
        # Project to Web-Mercator
        inner_data = inner_data.to_crs(3857)
        outer_data = outer_data.to_crs(3857)

        # Keep the geometries (the spatial join retains only the left geometry)
        inner_data['inner_geometry'] = inner_data.geometry
        outer_data['outer_geometry'] = outer_data.geometry

        # Confining
        confined_data = inner_data.sjoin(
            outer_data,
            how = 'inner',
            predicate = 'intersects',
            lsuffix = 'inner',
            rsuffix = 'outer'
        )

        # Thresholding
        overlap_area = confined_data['inner_geometry'].intersection(
            confined_data['outer_geometry']
        ).area
        required_area = self._area_threshold * confined_data['inner_geometry'].area
        # Explicit copy: the rows selected by the mask are modified below, and doing so
        # on a slice of `confined_data` would be chained assignment.
        thresholded_data = confined_data[overlap_area >= required_area].copy()

        # Cleaning (to ensure consistency w/o confinement)
        outer_cols = [col for col in thresholded_data if col.endswith('_outer')]
        thresholded_data = thresholded_data.drop(
            columns = ['inner_geometry', 'outer_geometry', *outer_cols]
        )
        # `removesuffix` removes exactly the '_inner' suffix. `rstrip('_inner')` would
        # strip any trailing run of the characters '_', 'i', 'n', 'e', 'r' and could
        # thereby truncate genuine column names.
        thresholded_data.columns = [
            col.removesuffix('_inner') for col in thresholded_data.columns
        ]
        thresholded_data = thresholded_data.reset_index(drop = True)

        # Restore to the original/inner CRS
        thresholded_data = thresholded_data.to_crs(inner_data_crs)

        # Drop the geometry column (if indicated False)
        if not include_geometries:
            return pd.DataFrame(thresholded_data.drop(columns = ['geometry']))

        return thresholded_data

    def _get_outer_download(
        self,
        dataset: str,
        year: int,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
    ) -> t.Optional[gpd.GeoDataFrame]:
        """Internal for retrieving the outer-layer geography (`None` if unavailable)."""
        self.__set_outer_download(dataset, year, variables, tables)
        return self._outer_geography

    def __set_outer_download(
        self,
        dataset: str,
        year: int,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
    ) -> None:
        """Internal for the actual call to the outer-layer geography."""
        # Skip the query only if a previous attempt was made for this very dataset and
        # year. The stored geometry comes from a query for one specific (dataset, year),
        # so reusing it for a different one would confine against stale geometries.
        query_key = (dataset, year)
        if self._query_attempt and self._query_key == query_key:
            return

        # Failures surface as a `None` outer geography, which `download()` reports with
        # its own warning; hence warnings from this internal query are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            outer_data = download(
                dataset = dataset,
                year = year,
                variables = variables,
                tables = tables,
                include_geometries = True,
                **self._geographic_specifiers
            )

        # Given our downloads are confined to the outer-layer
        # of geographies, retain only the geometry column.
        if isinstance(outer_data, gpd.GeoDataFrame):
            self._outer_geography = outer_data[['geometry']].copy()
        else:
            self._outer_geography = None

        # Record the attempt, indicating if future queries are made w/ the same
        # dataset, year, and set of geographic specifiers, we should retrieve the
        # stored info from the instance and don't run API calls.
        self._query_attempt = True
        self._query_key = query_key