Coverage for acspsuedo / query.py: 98%

169 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-11 16:02 +0000

1""" 

2Main-level entry point for downloading Census Bureau data. 

3 

4Note that it is recommended to obtain an API key if you are 

5making many (500+) queries in a daily session. An API key 

6is free to obtain at: https://api.census.gov/data/key_signup.html. 

7""" 

8 

9import typing as t 

10import warnings 

11 

12import aiohttp 

13import pandas as pd 

14import geopandas as gpd 

15import numpy as np 

16 

17from acspsuedo.fips import STATE_FIPS 

18from acspsuedo.source.geog import GeoSpecFmtter, ApiKeyConfig 

19from acspsuedo.source.shpfile import ShpfileFormatterException 

20from acspsuedo.source.shpfile_fmt import GEO_SPEC_METADATA 

21from acspsuedo.source.cache import VariableCache 

22from acspsuedo.source.na_values import REPLACEMENT_VALUES 

23from acspsuedo.source.low.protocols import fetch_table, batch_fetch_content 

24import acspsuedo.source.shpfile 

25 

26 

27 

# Module-level API key settings. `_fmt_url()` in this module reads the key
# through `api_key_config._get_api_key()` each time a request URL is built.
api_key_config: ApiKeyConfig = ApiKeyConfig()
"""
Configuration settings for the API key.

Note that you can specify the location of your API key through one of three ways:
- By assigning it to the `API_KEY` attribute (prioritized)
- By setting it in the operating system environment
- By writing it in a textfile (`./api_key.txt`) in the working directory.

You can customize the locations of the last two settings with the respective attributes:
- `api_key_config.OS_ENV_LOCATION`, for the operating system location
- `api_key_config.FILE_PATH`, for the file path location
"""

# Shared metadata cache. This module uses it to resolve the variables to
# request (`_vars_metadata`) and to cast the fetched columns (`_set_dtypes`).
variable_cache: VariableCache = VariableCache()
"""
Internal source for caching metadata information regarding tables and variables across
all American Community Survey datasets.

This is exposed here in case you wish to customize caching preferences and/or view any
information regarding variables/tables via the methods of this instance.
"""

# Re-export of the handler instance created in `acspsuedo.source.shpfile`;
# `_get_shpfile()` in this module calls its `fetch_tiger_shpfile()` method.
shapefile_handler: acspsuedo.source.shpfile.ShpFileHandler = acspsuedo.source.shpfile.shapefile_handler
"""
Internal handler interface for TIGER shapefiles.

This is exposed here in the scenario that you may want to customize caching preferences.
There are three such caching preferences that you may customize:

- `shapefile_handler.auto_cache` (`bool`; default True)

    Indicate whether or not to automatically cache extracted shapefile
    information.

- `shapefile_handler.cache_path` (:py:class:`pathlib.Path` or `string`;
    default `Path.home() / 'cache' / 'acspsuedo' / 'TIGER_shapefiles'`)

    If `auto_cache` is True, `cache_path` specifies the caching location
    of extracted shapefiles.

- `shapefile_handler.track_updated_cache` (`bool`; default True)

    Indicate whether or not tracked shapefiles posited in the previous
    cache location should be moved if/when a new cache location should
    be specified. For the justification of this part of the handler
    interface, please check out the `acspsuedo.source.shpfile` module.
"""

# Two helper functions, for viewing geographic scopes metadata.
# Both are plain aliases of the `GeoSpecFmtter` attributes of the same name,
# re-exported so they are reachable from this module.
check_path_existence = GeoSpecFmtter.check_path_existence
view_geographic_paths = GeoSpecFmtter.view_geographic_paths

80 

81 

82 

83 

def download(
    dataset: str,
    year: int,
    *,
    variables: t.Optional[t.Union[t.List[str], str]] = None,
    tables: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_variables: bool = True,
    convert_to_na: bool = True,
    include_geometries: bool = False,
    with_geometry_id_columns: bool = False,
    **geographic_specifiers
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download American Community Survey (ACS) data from the United States
    Census Bureau for the geographies of interest.

    Variables, tables, or a combination of the two may be requested.

    Parameters
    ----------
    dataset
        A supported ACS dataset.

        To view the list of supported datasets, as well as their respectively
        available years, see `acspsuedo.datasets`.

    year
        A calendar year for the ACS dataset. The year must be available for
        the specified dataset.

    variables
        A variable, or list of variables, to be queried from the ACS dataset.

    tables
        A dataset table, or list of tables, which must be supported by the
        ACS dataset of interest.

    drop_annotation_variables
        The Bureau often attaches supplementary, non-required attribute and
        margin-of-error information for estimate data. Indicate whether or
        not to drop this information. Default `True`.

    convert_to_na
        Indicate whether or not special values should be replaced with
        `np.nan` values. Default `True`.

    include_geometries
        Indicate whether or not to incorporate geometric information from the
        Census Bureau's TIGER Shapefile database. Useful for geographical
        analysis and/or map visualization. Default `False`.

        Note: Due to changes in naming conventions for TIGER shapefiles over
        the years, or the non-existence of geometric information for certain
        scopes, geometries may not always be available. In that case the
        return value is the plain :py:class:`pandas.DataFrame` with the
        queried data rather than a :py:class:`geopandas.GeoDataFrame`.

    with_geometry_id_columns
        If `include_geometries` is True, indicate whether or not to also keep
        the identifier columns that come with the geometric information.
        Default `False`.

        Note: These columns have been made to cohere with the identifier
        columns requested from the data query and are thus deemed redundant.

    geographic_specifiers
        A set of geographic specifiers specifying the geographies to which
        the queried data should be restricted.

        To view available fully-specified geographic paths for an ACS
        dataset, reference the `~view_geographic_paths()` function. If you
        know your geographic specifier(s), reference the
        `~check_path_existence()` function to see whether or not they are
        supported for an ACS dataset of interest and, if so, all geographic
        paths containing those specifiers.

    Returns
    -------
    A :py:class:`pandas.DataFrame` containing the queried American Community
    Survey data of interest.

    If `include_geometries` is `True`, and TIGER shapefile data exists for
    the queried data, the return is a :py:class:`geopandas.GeoDataFrame`
    containing geometric shapefile information.

    Notes
    -----
    For multiple queries (500+) in a session, it is recommended to obtain an
    API key. API keys are free to obtain at
    https://api.census.gov/data/key_signup.html. If you wish to specify an
    API key, set the key in your operating system's environment, e.g.,
    ```
    import os
    os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
    ```
    or write it to a textfile in the working directory (`./api_key.txt`). If
    both are supplied, the OS environment key is prioritized.

    The configuration for the locations of these settings can be customized.
    ```
    from acspsuedo.query import api_key_config
    api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
    api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
    ```
    """
    # Resolve the request URLs, the per-variable metadata and the normalised
    # geographic specifiers in one synchronous pass.
    request_urls, dtype_metadata, resolved_specifiers = _fmt_download_url(
        dataset=dataset,
        year=year,
        vars=variables,
        tbls=tables,
        drop_annotation_vars=drop_annotation_variables,
        **geographic_specifiers
    )

    raw_table = fetch_table(request_urls)
    cleaned = _df_cleaner(
        raw_table, year, dtype_metadata, convert_to_na, **resolved_specifiers
    )

    if not include_geometries:
        return cleaned

    return append_geographic_info(
        cleaned, year, with_geometry_id_columns, **resolved_specifiers
    )

199 

200 

async def async_download(
    session: aiohttp.ClientSession,
    dataset: str,
    year: int,
    *,
    variables: t.Optional[t.Union[t.List[str], str]] = None,
    tables: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_variables: bool = True,
    convert_to_na: bool = True,
    retry_rate: int = 30,
    timeout_rate: t.Union[float, int] = 0.1,
    include_geometries: bool = False,
    with_geometry_id_columns: bool = False,
    **geographic_specifiers
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Asynchronous counterpart of `download()` for American Community Survey
    (ACS) data from the United States Census Bureau. Requests are issued
    concurrently, since fetches are I/O-bound tasks.

    Variables, tables, or a combination of the two may be requested.

    Parameters
    ----------
    session
        A(n) :py:class:`aiohttp.ClientSession` interface/context manager.

    dataset
        A supported ACS dataset.

        To view the list of supported datasets, as well as their respectively
        available years, see `acspsuedo.datasets`.

    year
        A calendar year for the ACS dataset. The year must be available for
        the specified dataset.

    variables
        A variable, or list of variables, to be queried from the ACS dataset.

    tables
        A dataset table, or list of tables, which must be supported by the
        ACS dataset of interest.

    drop_annotation_variables
        The Bureau often attaches supplementary, non-required attribute and
        margin-of-error information for estimate data. Indicate whether or
        not to drop this information. Default `True`.

    convert_to_na
        Indicate whether or not special values should be replaced with
        `np.nan` values. Default `True`.

    retry_rate
        In case of server-based blocking, how many attempts should be made
        per URL before skipping. Default 30.

    timeout_rate
        When querying large amounts of tables/variables, by how much (in
        seconds) each request attempt should be delayed. Default 0.1 seconds.

    include_geometries
        Indicate whether or not to incorporate geometric information from the
        Census Bureau's TIGER Shapefile database. Useful for geographical
        analysis and/or map visualization. Default `False`.

        Note: Due to changes in naming conventions for TIGER shapefiles over
        the years, or the non-existence of geometric information for certain
        scopes, geometries may not always be available. In that case the
        return value is the plain :py:class:`pandas.DataFrame` with the
        queried data rather than a :py:class:`geopandas.GeoDataFrame`.

    with_geometry_id_columns
        If `include_geometries` is True, indicate whether or not to also keep
        the identifier columns that come with the geometric information.
        Default `False`.

        Note: These columns have been made to cohere with the identifier
        columns requested from the data query and are thus deemed redundant.

    geographic_specifiers
        A set of geographic specifiers specifying the geographies to which
        the queried data should be restricted.

        To view available fully-specified geographic paths for an ACS
        dataset, reference the `~view_geographic_paths()` function. If you
        know your geographic specifier(s), reference the
        `~check_path_existence()` function to see whether or not they are
        supported for the ACS dataset of interest and, if so, all geographic
        paths containing those specifiers.

    Returns
    -------
    A :py:class:`pandas.DataFrame` containing the queried American Community
    Survey data of interest.

    If `include_geometries` is `True`, and TIGER shapefile data exists for
    the queried data, the return is a :py:class:`geopandas.GeoDataFrame`
    containing geometric shapefile information.

    Notes
    -----
    For multiple queries (500+) in a session, it is recommended to obtain an
    API key. API keys are free to obtain at
    https://api.census.gov/data/key_signup.html. If you wish to specify an
    API key, set the key in your operating system's environment, e.g.,
    ```
    import os
    os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
    ```
    or write it to a textfile in the working directory (`./api_key.txt`). If
    both are supplied, the OS environment key is prioritized.

    The configuration for the locations of these settings can be customized.
    ```
    from acspsuedo.query import api_key_config
    api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
    api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
    ```
    """
    # All URL/metadata formatting (which is what touches the variable cache)
    # happens here, synchronously and exactly once, before any request is
    # awaited. The cache is therefore not modified while fetches are in
    # flight, which is why the concurrent fetching below raises no
    # thread-safety concern.
    request_urls, dtype_metadata, resolved_specifiers = _fmt_download_url(
        dataset=dataset,
        year=year,
        vars=variables,
        tbls=tables,
        drop_annotation_vars=drop_annotation_variables,
        **geographic_specifiers
    )

    raw_table = await batch_fetch_content(
        session, request_urls, retry_rate, timeout_rate
    )
    cleaned = _df_cleaner(
        raw_table, year, dtype_metadata, convert_to_na, **resolved_specifiers
    )

    if not include_geometries:
        return cleaned

    return append_geographic_info(
        cleaned, year, with_geometry_id_columns, **resolved_specifiers
    )

338 

339 

340 

def _df_cleaner(
    df: pd.DataFrame,
    year: int,
    meta_dict: t.Dict[t.Any, t.Any],
    drop_na: bool = True,
    **geo_specifiers
) -> pd.DataFrame:
    """
    Internal dataframe cleaner for queried Census Bureau data.

    Parameters
    ----------
    df
        The freshly fetched table. Note that a `YEAR` column is added to it
        and its column labels are upper-cased in place before a reordered
        copy is made.

    year
        The calendar year of the query; written to the `YEAR` column.

    meta_dict
        A dictionary containing data types for each of the queried variables.

    drop_na
        Indicate whether or not special values should be replaced with
        `np.nan` values (values are replaced, rows are not dropped).
        Default `True`.

    geo_specifiers
        A set of geographic specifiers specifying the geographies to which
        the queried data was restricted.

    Returns
    -------
    The cleaned :py:class:`pandas.DataFrame`: identifier columns first, data
    columns sorted after them, duplicate columns removed, rows sorted and
    column dtypes set.
    """
    # Tag every record with the query year, then normalise the labels
    df['YEAR'] = year
    df.columns = [label.upper() for label in df.columns]

    # Identifier columns lead; every other column follows in sorted order
    available = list(df.columns)
    id_candidates = [
        'NAME', 'GEO_ID', 'UCGID',
        *GeoSpecFmtter.get_geo_cols(**geo_specifiers),
        'YEAR',
    ]
    id_cols = [label for label in id_candidates if label in available]
    data_cols = sorted(label for label in available if label not in id_cols)
    df = df[id_cols + data_cols]

    # Keep only the first occurrence of any repeated column label
    first_occurrence = ~df.columns.duplicated()
    df = df.iloc[:, first_occurrence].copy()

    # Row order: by GEO_ID when present, otherwise by the identifier columns
    if 'GEO_ID' in df.columns:
        # Keep only the part of the identifier after the first 'US' marker
        df['GEO_ID'] = [geo_id.split('US', 1)[-1] for geo_id in df['GEO_ID']]
        sort_keys = 'GEO_ID'
    else:
        sort_keys = list(id_cols)
        if 'NAME' in sort_keys:
            sort_keys.remove('NAME')
    df = df.sort_values(by=sort_keys, ignore_index=True)

    # Cast the columns to the dtypes recorded in the metadata
    df = variable_cache._set_dtypes(df, meta_dict)

    # Swap the special placeholder values for NaN (if requested)
    if drop_na:
        df.replace(REPLACEMENT_VALUES, np.nan, inplace=True)

    return df

398 

399 

400 

401 

def _fmt_download_url(
    dataset: str,
    year: int,
    vars: t.Optional[t.Union[t.List[str], str]] = None,
    tbls: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_vars: bool = True,
    **geog_specifiers
) -> t.Tuple[list[str], t.Dict[str, str], t.Dict[str, str]]:
    """
    Internal for formatting the download links to the Census Bureau.

    The resolved variables are split into batches of at most 50 per URL, so
    a query for more than 50 variables yields several links.

    Parameters
    ----------
    dataset
        The ACS dataset being queried.

    year
        The calendar year being queried.

    vars
        A variable, or list of variables, requested by the caller.

    tbls
        A table, or list of tables, requested by the caller.

    drop_annotation_vars
        Forwarded to the variable cache when resolving the variables.

    geog_specifiers
        The geographic specifiers of the query.

    Returns
    -------
    A 3-tuple of: the list of request URLs, the metadata dictionary returned
    by the variable cache, and the geographic specifiers as returned by
    `_fmt_url()`.
    """
    url, geog_specifiers = _fmt_url(dataset, year, **geog_specifiers)
    vars, meta_dict = variable_cache._vars_metadata(dataset, year, vars, tbls, drop_annotation_vars)

    batch_size = 50

    # The stop value must be `len(vars)`, not `len(vars) + 1`: with the extra
    # one, a variable count that is an exact multiple of the batch size
    # produced a trailing URL with an empty `get=` list. `max(..., 1)` keeps
    # the previous behaviour of emitting a single URL when no variable was
    # resolved at all.
    batch_starts = range(0, max(len(vars), 1), batch_size)
    urls = [
        url.format(','.join(vars[start:start + batch_size]))
        for start in batch_starts
    ]

    return urls, meta_dict, geog_specifiers

420 

421 

def _fmt_url(dataset: str, year: int, **geog_specifiers):
    """
    Build the URL skeleton shared by every request of a query.

    The returned URL still contains one empty `{}` placeholder in its `get=`
    section, to be filled in later with a comma-separated variable list.

    Returns
    -------
    A 2-tuple of the URL skeleton and the geographic specifiers as returned
    by `GeoSpecFmtter.get_fmt_path()`.
    """
    geo_path, resolved_specifiers = GeoSpecFmtter.get_fmt_path(
        dataset, year, **geog_specifiers
    )

    template = 'https://api.census.gov/data/{year}/{dataset}?get={var}{geo_specs}{key}'
    fields = {
        'year': str(year),
        'dataset': dataset,
        # Re-emit a bare placeholder so the variables can be formatted in later
        'var': '{}',
        'geo_specs': geo_path,
        'key': api_key_config._get_api_key(),
    }

    return template.format(**fields), resolved_specifiers

438 

439 

440 

441 

442 

def append_geographic_info(
    data_df: pd.DataFrame,
    year: int,
    with_geometry_columns: bool = False,
    **geographic_specifiers: t.Any
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Add geographic data to the queried dataset.

    Parameters
    ----------
    data_df
        The returned :py:class:`pandas.DataFrame` instance generated from the
        query.

    year
        The calendar year for the queried data.

    with_geometry_columns
        Indicate whether or not to keep the additional (non-geometry) columns
        that the shapefile contributes to the merged result. When `False`,
        only the queried columns and the `geometry` column are returned.
        Default `False`.

    geographic_specifiers
        The set of geographic specifiers that were used from the query. The
        last specifier determines which shapefile scope is merged.

    Returns
    -------
    A :py:class:`geopandas.GeoDataFrame` instance containing geographic
    information for each record/row from the fetched Census Bureau data
    *provided a shapefile is found*. Otherwise, the originally queried data
    is returned unchanged.
    """
    shapes = _get_shpfile(data_df, year, **geographic_specifiers)

    # An empty frame signals shapefile non-existence and/or naming convention
    # issues; the internal handler set-up is expected to have warned already.
    if shapes.empty:
        return data_df

    # The last specifier governs which shapefile scope was fetched
    innermost_scope = list(geographic_specifiers)[-1]
    _, data_keys, _, shape_keys, _ = GEO_SPEC_METADATA[innermost_scope]

    # Also join on GEO_ID whenever both sides carry it
    if 'GEO_ID' in data_df.columns and 'GEO_ID' in shapes.columns:
        data_keys = ['GEO_ID', *data_keys]
        shape_keys = ['GEO_ID', *shape_keys]

    # Right join: every queried record is kept, matched or not
    merged = pd.merge(
        left=shapes,
        right=data_df,
        how="right",
        left_on=[*shape_keys, 'YEAR'],
        right_on=[*data_keys, 'YEAR'],
    )

    # Queried columns first, optional shapefile columns next, geometry last
    queried_cols = list(data_df.columns)
    reserved = [*queried_cols, 'geometry']
    shape_only_cols = [label for label in merged.columns if label not in reserved]

    ordered = list(queried_cols)
    if with_geometry_columns:
        ordered.extend(shape_only_cols)
    ordered.append('geometry')

    return merged[ordered]

505 

506 

507 

def _get_shpfile(
    df: pd.DataFrame,
    year: int,
    **geographic_specifiers: t.Any
) -> gpd.GeoDataFrame:
    """
    Fetch the TIGER shapefile matching a query through the shapefile handler
    (which runs to the shapefile database if files are not previously
    cached), falling back to per-state fetches for shapefiles that require a
    'state' outer point of reference.

    Parameters
    ----------
    df
        The queried data. Its `STATE` column, when present, limits the
        per-state fallback to the states that actually occur in the data.

    year
        The calendar year for the queried data.

    geographic_specifiers
        The geographic specifiers of the query. The last one determines the
        shapefile scope used by the per-state fallback.

    Returns
    -------
    A :py:class:`geopandas.GeoDataFrame` with the shapefile contents, or an
    empty :py:class:`geopandas.GeoDataFrame` when no shapefile was found.
    """
    try:
        gdf = shapefile_handler.fetch_tiger_shpfile(year, **geographic_specifiers)
        if gdf is None:
            # Indicates our anticipated naming convention issues or shapefile non-existence
            return gpd.GeoDataFrame()
        return gdf

    # Specific handling for shapefiles whose outer point of reference is 'state'
    except ShpfileFormatterException:

        # If 'STATE' is found, great. Otherwise, we load all 'state' FIPS codes and
        # extract the shapefiles from each. The latter typically arises when users
        # run data queries for certain geographic pathways of length 1 and/or containing
        # wildcard operators (e.g. {'congressional_district': '*'}).
        if 'STATE' in df.columns:
            states = list(df['STATE'].unique())
        else:
            states = list(STATE_FIPS.values())

        # Shapefile scope governed by last specifier
        shpfile_scope = list(geographic_specifiers)[-1]

        state_gdfs = []
        # Handler warnings are silenced here: they would otherwise repeat
        # once per state that has no shapefile.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for state in states:
                state_gdf = shapefile_handler.fetch_tiger_shpfile(
                    year, **{'state': state, shpfile_scope: '*'}
                )
                if state_gdf is not None:
                    state_gdfs.append(state_gdf)

        # `pd.concat` raises a ValueError on an empty list. Return the same
        # empty frame as the direct path instead, and warn once since the
        # per-state handler warnings were suppressed above.
        if not state_gdfs:
            warnings.warn(
                f"No TIGER shapefile could be located for the '{shpfile_scope}' "
                f"scope in any of the queried states for the {year} calendar year.",
                UserWarning
            )
            return gpd.GeoDataFrame()

        return pd.concat(state_gdfs, ignore_index=True)

550 

551 

552 

def confined_download(area_threshold: t.Union[int, float] = 0.7, **geographic_specifiers):
    """
    Prepare a download of American Community Survey (ACS) data whose
    geographies are confined to the outer layer of geographies given here.

    Parameters
    ----------
    area_threshold
        The share (between 0 and 1) of each inner-layer geography's area that
        must lie within the outer-layer geography area. Default 0.7.

    geographic_specifiers
        A set of geographic specifiers describing the outer-layer geographies
        within which the queried, inner-layer geographies should be confined.

        To view available fully-specified geographic paths for an ACS
        dataset, reference the `~view_geographic_paths()` function. If you
        know your geographic specifier(s), reference the
        `~check_path_existence()` function to see whether or not they are
        supported for an ACS dataset of interest and, if so, all geographic
        paths containing those specifiers.

    Returns
    -------
    A :py:class:`acspsuedo.query._ConfinedDownload` instance.

    *Note*: This class supports a (synchronous) download method, whose
    parameter space is the same as that of the normal
    `acspsuedo.query.download()` function.

    Raises
    ------
    ValueError
        If `area_threshold` is below 0 or above 1.
    """
    out_of_range = area_threshold < 0 or area_threshold > 1
    if out_of_range:
        raise ValueError("Valid area threshold values must be between 0 and 1.")

    confined = _ConfinedDownload(area_threshold, **geographic_specifiers)
    return confined

584 

585 

586 

class _ConfinedDownload:
    """
    Handler for downloading ACS data at geographic specifiers that are
    confined to a different scope than readily permissible.

    The instance stores the outer-layer geographic specifiers and an area
    threshold. The outer-layer geometry is fetched on the first `download()`
    call and then reused for later calls made for the same year.
    """
    def __init__(self, area_threshold: t.Union[int, float], **geograhic_specifiers) -> None:
        self._area_threshold = area_threshold
        self._geographic_specifiers = geograhic_specifiers

        # Internals for:
        # - Checking if a query attempt has been made,
        # - The outer-layer geography (if an attempt is successful), and
        # - The year that attempt was made for (geometries are year-specific)
        self._query_attempt = False
        self._outer_geography = None
        self._outer_year = None

    @property
    def area_threshold(self):
        """
        The percentage of the inner-layer of geographies' areas that must be within the
        outer-layer geography area.
        """
        return self._area_threshold

    @area_threshold.setter
    def area_threshold(self, new_threshold: t.Union[int, float]):
        if (0 > new_threshold) or (new_threshold > 1):
            raise ValueError("Valid area threshold values must be between 0 and 1.")
        self._area_threshold = new_threshold

    @property
    def geographic_specifiers(self):
        """
        The geographic specifiers indicating the outer-layer geography area to which
        inner-level geographies from queried data will be confined to.
        """
        return self._geographic_specifiers

    @geographic_specifiers.setter
    def geographic_specifiers(self, new_specififers: t.Dict[t.Any, t.Any]):
        # New specifiers invalidate the stored outer layer: reset the query
        # attempt state so the next download fetches it again.
        if self._geographic_specifiers != new_specififers:
            self._query_attempt = False
        self._geographic_specifiers = new_specififers

    def __repr__(self) -> str:
        return "_ConfinedDownload(area_threshold = {}, geographic_specifiers = {{{}}})".format(
            self._area_threshold,
            ', '.join([f"{k} = {v}" for k, v in self._geographic_specifiers.items()])
        )

    def __eq__(self, other) -> bool:
        if isinstance(other, _ConfinedDownload):
            return (self._area_threshold == other._area_threshold) and \
                (self._geographic_specifiers == other.geographic_specifiers)
        return False

    def download(
        self,
        dataset: str,
        year: int,
        *,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
        drop_annotation_variables: bool = True,
        convert_to_na: bool = True,
        include_geometries: bool = False,
        with_geometry_id_columns: bool = False,
        **geographic_specifiers
    ) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download American Community Survey (ACS) data from the United States
        Census Bureau for geographies of interest that are confined to the
        outer layer of geographies stored on this instance.

        Variables, tables, or a combination of the two may be requested.

        Parameters
        ----------
        dataset
            A supported ACS dataset.

            To view the list of supported datasets, as well as their
            respectively available years, see `acspsuedo.datasets`.

        year
            A calendar year for the ACS dataset. The year must be available
            for the specified dataset.

        variables
            A variable, or list of variables, to be queried from the ACS
            dataset.

        tables
            A dataset table, or list of tables, which must be supported by
            the ACS dataset of interest.

        drop_annotation_variables
            The Bureau often attaches supplementary, non-required attribute
            and margin-of-error information for estimate data. Indicate
            whether or not to drop this information. Default `True`.

        convert_to_na
            Indicate whether or not special values should be replaced with
            `np.nan` values. Default `True`.

        include_geometries
            Indicate whether or not to keep the geometric information from
            the Census Bureau's TIGER Shapefile database in the result.
            Default `False`. (Geometries are always fetched internally, as
            they are needed for the confinement itself.)

            Note: Due to changes in naming conventions for TIGER shapefiles
            over the years, or the non-existence of geometric information for
            certain scopes, geometries may not always be available. In that
            case the unconfined queried data is returned as a
            :py:class:`pandas.DataFrame`.

        with_geometry_id_columns
            Indicate whether or not to also keep the identifier columns that
            come with the geometric information. Default `False`.

            Note: These columns have been made to cohere with the identifier
            columns requested from the data query and are thus deemed
            redundant.

        geographic_specifiers
            The set of inner-layer geographic specifiers indicating the
            geographies the queried data refers to. Queried data at this
            inner layer is, in turn, confined to the outer-layer geography.

            To view available fully-specified geographic paths for an ACS
            dataset, reference the `~view_geographic_paths()` function. If
            you know your geographic specifier(s), reference the
            `~check_path_existence()` function to see whether or not they
            are supported for an ACS dataset of interest and, if so, all
            geographic paths containing those specifiers.

        Returns
        -------
        A :py:class:`pandas.DataFrame` containing the queried American
        Community Survey data of interest.

        If `include_geometries` is `True`, and TIGER shapefile data exists
        for the queried data, the return is a
        :py:class:`geopandas.GeoDataFrame` containing geometric shapefile
        information.

        Notes
        -----
        An empty result may be returned. This corresponds to a scenario in
        which no inner-layer geography intersects the outer-layer geography
        with at least the required share of its area.

        For multiple queries (500+) in a session, it is recommended to obtain
        an API key. API keys are free to obtain at
        https://api.census.gov/data/key_signup.html. If you wish to specify
        an API key, set the key in your operating system's environment, e.g.,
        ```
        import os
        os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
        ```
        or write it to a textfile in the working directory
        (`./api_key.txt`). If both are supplied, the OS environment key is
        prioritized.

        The configuration for the locations of these settings can be
        customized.
        ```
        from acspsuedo.query import api_key_config
        api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
        api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
        ```
        """
        # Geometries are always requested for the inner layer: they are what
        # the confinement is computed on, even if the caller does not want
        # them in the final result.
        inner_data = download(
            dataset = dataset,
            year = year,
            variables = variables,
            tables = tables,
            drop_annotation_variables = drop_annotation_variables,
            convert_to_na = convert_to_na,
            with_geometry_id_columns = with_geometry_id_columns,
            include_geometries = True,
            **geographic_specifiers
        )
        # To accomodate for cases when a TIGER shapefile cannot be found (thanks
        # to our earlier configurations, a warning is automatically raised)
        if not isinstance(inner_data, gpd.GeoDataFrame):
            return inner_data

        inner_crs = inner_data.crs  # <- Keep the original Coordinate Referencing System

        outer_data = self._get_outer_download(dataset, year, variables, tables)

        if outer_data is None:
            msg = \
                f"\nCould not locate the appropriate TIGER shapefile for the outer-layer set " \
                f"of geographies given by {self._geographic_specifiers} for the {year} calendar\n" \
                f"year.\n" \
                "\nAs a result, the returned set of data corresponds to data downloaded solely from " \
                "the reference of the inner-layer set of geographic specifiers."

            warnings.warn( msg, UserWarning )
            return inner_data

        # Necessary to avoid modifying the referenced (cached) object
        outer_data = outer_data.copy()

        confined_data = self.__confined_data_fmtter(inner_data, outer_data, inner_crs, include_geometries)

        return confined_data

    @staticmethod
    def _strip_inner_suffix(label: str) -> str:
        """
        Remove one trailing `'_inner'` suffix from a column label.

        `str.rstrip('_inner')` must not be used for this: it strips any
        trailing run of the characters `_`, `i`, `n`, `e`, `r`, and would
        mangle unrelated labels (e.g. `'center'` -> `'ct'`).
        """
        return label.removesuffix('_inner')

    def __confined_data_fmtter(
        self,
        inner_data: gpd.GeoDataFrame,
        outer_data: gpd.GeoDataFrame,
        inner_data_crs: t.Any,
        include_geometries: bool
    ) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Keep the inner-layer records whose geometry overlaps the outer-layer
        geometry by at least the configured share of their own area.

        Both frames are modified in place (re-projected and given a helper
        geometry column), so callers must pass frames they own.
        """
        # Project to Web-Mercator, so both layers share one planar CRS
        inner_data.to_crs(3857, inplace=True)
        outer_data.to_crs(3857, inplace=True)

        # Keep both geometries as regular columns: the spatial join only
        # retains the left frame's active geometry.
        inner_data['inner_geometry'] = inner_data.geometry
        outer_data['outer_geometry'] = outer_data.geometry

        # Confining
        confined_data = inner_data.sjoin(
            outer_data,
            how = 'inner',
            predicate = 'intersects',
            lsuffix = 'inner',
            rsuffix = 'outer'
        )

        # Thresholding
        thresholded_data = confined_data[
            confined_data['inner_geometry'].intersection(confined_data['outer_geometry']).area >=
            self._area_threshold * confined_data['inner_geometry'].area
        ]

        # Cleaning (to ensure consistency w/o confinement). `drop` is used
        # out-of-place because `thresholded_data` is a filtered selection of
        # `confined_data`.
        helper_cols = [
            'inner_geometry',
            'outer_geometry',
            *[col for col in thresholded_data if col.endswith('_outer')]
        ]
        thresholded_data = thresholded_data.drop(columns = helper_cols)
        thresholded_data.columns = [
            self._strip_inner_suffix(col) for col in thresholded_data.columns
        ]
        thresholded_data.reset_index(drop = True, inplace = True)

        # Restore to the original/inner CRS
        thresholded_data.to_crs(inner_data_crs, inplace = True)

        # Drop the geometry column (if indicated False) and hand back a plain
        # DataFrame, as documented, rather than a geometry-less GeoDataFrame.
        if not include_geometries:
            return pd.DataFrame(thresholded_data.drop(columns = ['geometry']))

        return thresholded_data

    def _get_outer_download(
        self,
        dataset: str,
        year: int,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
    ) -> t.Optional[gpd.GeoDataFrame]:
        """Internal for retrieving the outer-layer geography."""
        self.__set_outer_download(dataset, year, variables, tables)
        return self._outer_geography

    def __set_outer_download(
        self,
        dataset: str,
        year: int,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
    ) -> None:
        """Internal for the actual call to the outer-layer geography."""
        # Reuse the stored result only if an attempt was already made for
        # this same year. Shapefiles are fetched per year, so a result (or a
        # failure) recorded for another year says nothing about this one.
        if self._query_attempt and self._outer_year == year:
            return

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            outer_data = download(
                dataset = dataset,
                year = year,
                variables = variables,
                tables = tables,
                include_geometries = True,
                **self._geographic_specifiers
            )

            # Given our downloads are confined to the outer-layer
            # of geographies, retain only the geometry column.
            if isinstance(outer_data, gpd.GeoDataFrame):
                self._outer_geography = outer_data[['geometry']].copy()
            else:
                self._outer_geography = None

        # Record the attempt, so future queries made w/ the same set of
        # geographic specifiers and year retrieve the stored info from the
        # instance and don't run API calls.
        self._outer_year = year
        self._query_attempt = True