Coverage for acspsuedo/query.py: 98%

1"""

2Main-level entry point for downloading Census Bureau data.

4Note that it is recommended to obtain an API key if you are

5making many (500+) queries in a daily session. An API key

6is free to obtain at: https://api.census.gov/data/key_signup.html.

7"""

9import typing as t

10import warnings

12import aiohttp

13import pandas as pd

14import geopandas as gpd

15import numpy as np

17from acspsuedo.fips import STATE_FIPS

18from acspsuedo.source.geog import GeoSpecFmtter, ApiKeyConfig

19from acspsuedo.source.shpfile import ShpfileFormatterException

20from acspsuedo.source.shpfile_fmt import GEO_SPEC_METADATA

21from acspsuedo.source.cache import VariableCache

22from acspsuedo.source.na_values import REPLACEMENT_VALUES

23from acspsuedo.source.low.protocols import fetch_table, batch_fetch_content

24import acspsuedo.source.shpfile

# Module-wide API-key configuration object. `_fmt_url()` in this module calls
# `api_key_config._get_api_key()` each time it builds a request URL.
api_key_config: ApiKeyConfig = ApiKeyConfig()
"""
Configuration settings for the API key.

Note that you can specify the location of your API key through one of three ways:
- By assigning it to the `API_KEY` attribute (prioritized)
- By setting it in the operating system environment
- By writing it in a textfile (`./api_key.txt`) in the working directory.

You can customize the locations of the last two settings with the respective attributes:
- `api_key_config.OS_ENV_LOCATION`, for the operating system location
- `api_key_config.FILE_PATH`, for the file path location
"""

# Module-wide variable/table metadata cache. In this module it is used by
# `_fmt_download_url()` (via `_vars_metadata`) and `_df_cleaner()` (via `_set_dtypes`).
variable_cache: VariableCache = VariableCache()
"""
Internal source for caching metadata information regarding tables and variables across
all American Community Survey datasets.

This is exposed here in case you wish to customize caching preferences and/or view any
information regarding variables/tables via the methods of this instance.
"""

# Re-export of the handler object defined in `acspsuedo.source.shpfile` (same
# object, not a copy). `_get_shpfile()` uses it to fetch TIGER shapefiles.
shapefile_handler: acspsuedo.source.shpfile.ShpFileHandler = acspsuedo.source.shpfile.shapefile_handler
"""
Internal handler interface for TIGER shapefiles.

This is exposed here in the scenario that you may want to customize caching preferences.
There are three such caching preferences that you may customize:

- `shapefile_handler.auto_cache` (`bool`; default True)
 Indicate whether or not to automatically cache extracted shapefile
 information.

- `shapefile_handler.cache_path` (:py:class:`pathlib.Path` or `string`;
 default `Path.home() / 'cache' / 'acspsuedo' / 'TIGER_shapefiles'`)
 If `auto_cache` is True, `cache_path` specifies the caching location
 of extracted shapefiles.

- `shapefile_handler.track_updated_cache` (`bool`; default True)
 Indicate whether or not tracked shapefiles posited in the previous
 cache location should be moved if/when a new cache location should
 be specified. For the justification of this part of the handler
 interface, please check out the `acspsuedo.source.shpfile` module.
"""

# Two helper functions, for viewing geographic scopes metadata.
# Both are plain aliases of the corresponding `GeoSpecFmtter` attributes, exposed
# here so they can be reached from the query module.
check_path_existence = GeoSpecFmtter.check_path_existence
view_geographic_paths = GeoSpecFmtter.view_geographic_paths

def download(
    dataset: str,
    year: int,
    *,
    variables: t.Optional[t.Union[t.List[str], str]] = None,
    tables: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_variables: bool = True,
    convert_to_na: bool = True,
    include_geometries: bool = False,
    with_geometry_id_columns: bool = False,
    **geographic_specifiers
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Query the United States Census Bureau's American Community Survey (ACS)
    for some geographies of interest and return the cleaned result.

    Particular variables, tables, or any combination of the two may be requested.

    Parameters
    ----------
    dataset
        A supported ACS dataset.

        The supported datasets, and the years available for each, are listed
        in `acspsuedo.datasets`.

    year
        A calendar year for the ACS dataset. It must be available for the
        dataset of interest.

    variables
        A variable, or list of variables, to be queried from the ACS dataset.

    tables
        A dataset table, or list of tables, supported by the ACS dataset of interest.

    drop_annotation_variables
        The Bureau often attaches supplementary, non-required attribute and
        margin-of-error information to estimate data. Indicate whether or not
        to drop this information. Default `True`.

    convert_to_na
        Indicate whether or not special values should be replaced with `np.nan`
        values. Default `True`.

    include_geometries
        Indicate whether or not to incorporate geometric information from the
        Census Bureau's TIGER Shapefile database. Useful for geographical
        analysis and/or map visualization. Default `False`.

        Note: because TIGER shapefile naming conventions have changed over the
        years, and because some scopes have no geometric information at all,
        geometries cannot always be attached. In that case the plain
        :py:class:`pandas.DataFrame` of queried data is returned instead of the
        anticipated :py:class:`geopandas.GeoDataFrame`.

    with_geometry_id_columns
        If `include_geometries` is True, indicate whether or not to keep the
        identifier columns that accompany the geometric information.
        Default `False`.

        Note: these columns cohere with the identifier columns already returned
        by the data query and are therefore deemed redundant; set this to `True`
        only if you want that additional identifier information.

    geographic_specifiers
        A set of geographic specifiers restricting the geographies for which
        data is queried.

        Use `~view_geographic_paths()` to list the fully-specified geographic
        paths available for an ACS dataset, or `~check_path_existence()` to check
        whether given specifiers are supported for a dataset and, if so, which
        geographic paths contain them.

    Returns
    -------
    A :py:class:`pandas.DataFrame` containing the queried American Community
    Survey data of interest.

    If `include_geometries` is `True`, and TIGER shapefile data exists for the
    queried data, the return is a :py:class:`geopandas.GeoDataFrame` containing
    geometric shapefile information.

    Notes
    -----
    For multiple queries (500+) in a session, it is recommended to obtain an
    API key. API keys are free to obtain at
    https://api.census.gov/data/key_signup.html. To supply an API key, set it in
    your operating system's environment, e.g.,
    ```
    import os
    os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
    ```
    or write it to a textfile in the working directory (`./api_key.txt`). If
    both are supplied, the OS environment key is prioritized.

    The locations of these settings can be customized.
    ```
    from acspsuedo.query import api_key_config
    api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
    api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
    ```
    """
    # Build the request URLs and per-variable metadata. The formatter also hands
    # back the geographic specifiers, which are the ones used from here on.
    request_urls, dtype_metadata, geographic_specifiers = _fmt_download_url(
        dataset=dataset,
        year=year,
        vars=variables,
        tbls=tables,
        drop_annotation_vars=drop_annotation_variables,
        **geographic_specifiers
    )

    raw_table = fetch_table(request_urls)
    cleaned_table = _df_cleaner(
        raw_table, year, dtype_metadata, convert_to_na, **geographic_specifiers
    )

    if not include_geometries:
        return cleaned_table

    return append_geographic_info(
        cleaned_table, year, with_geometry_id_columns, **geographic_specifiers
    )

199

200

async def async_download(
    session: aiohttp.ClientSession,
    dataset: str,
    year: int,
    *,
    variables: t.Optional[t.Union[t.List[str], str]] = None,
    tables: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_variables: bool = True,
    convert_to_na: bool = True,
    retry_rate: int = 30,
    timeout_rate: t.Union[float, int] = 0.1,
    include_geometries: bool = False,
    with_geometry_id_columns: bool = False,
    **geographic_specifiers
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Asynchronous counterpart of `download()` for the United States Census
    Bureau's American Community Survey (ACS) datasets. Requests are fetched
    concurrently, since fetches are I/O-bound tasks.

    Particular variables, tables, or any combination of the two may be requested.

    Parameters
    ----------
    session
        A(n) :py:class:`aiohttp.ClientSession` interface/context manager.

    dataset
        A supported ACS dataset.

        The supported datasets, and the years available for each, are listed
        in `acspsuedo.datasets`.

    year
        A calendar year for the ACS dataset. It must be available for the
        dataset of interest.

    variables
        A variable, or list of variables, to be queried from the ACS dataset.

    tables
        A dataset table, or list of tables, supported by the ACS dataset of interest.

    drop_annotation_variables
        The Bureau often attaches supplementary, non-required attribute and
        margin-of-error information to estimate data. Indicate whether or not
        to drop this information. Default `True`.

    convert_to_na
        Indicate whether or not special values should be replaced with `np.nan`
        values. Default `True`.

    retry_rate
        In case of server-based blocking, how many attempts should be made per
        URL before skipping. Default 30.

    timeout_rate
        When querying large amounts of tables/variables, by how much (in
        seconds) each request attempt should be delayed. Default 0.1 seconds.

    include_geometries
        Indicate whether or not to incorporate geometric information from the
        Census Bureau's TIGER Shapefile database. Useful for geographical
        analysis and/or map visualization. Default `False`.

        Note: because TIGER shapefile naming conventions have changed over the
        years, and because some scopes have no geometric information at all,
        geometries cannot always be attached. In that case the plain
        :py:class:`pandas.DataFrame` of queried data is returned instead of the
        anticipated :py:class:`geopandas.GeoDataFrame`.

    with_geometry_id_columns
        If `include_geometries` is True, indicate whether or not to keep the
        identifier columns that accompany the geometric information.
        Default `False`.

        Note: these columns cohere with the identifier columns already returned
        by the data query and are therefore deemed redundant; set this to `True`
        only if you want that additional identifier information.

    geographic_specifiers
        A set of geographic specifiers restricting the geographies for which
        data is queried.

        Use `~view_geographic_paths()` to list the fully-specified geographic
        paths available for an ACS dataset, or `~check_path_existence()` to check
        whether given specifiers are supported for the dataset and, if so, which
        geographic paths contain them.

    Returns
    -------
    A :py:class:`pandas.DataFrame` containing the queried American Community
    Survey data of interest.

    If `include_geometries` is `True`, and TIGER shapefile data exists for the
    queried data, the return is a :py:class:`geopandas.GeoDataFrame` containing
    geometric shapefile information.

    Notes
    -----
    For multiple queries (500+) in a session, it is recommended to obtain an
    API key. API keys are free to obtain at
    https://api.census.gov/data/key_signup.html. To supply an API key, set it in
    your operating system's environment, e.g.,
    ```
    import os
    os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
    ```
    or write it to a textfile in the working directory (`./api_key.txt`). If
    both are supplied, the OS environment key is prioritized.

    The locations of these settings can be customized.
    ```
    from acspsuedo.query import api_key_config
    api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
    api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
    ```
    """
    # All URL/metadata preparation, which is the part that relies on the
    # variable cache, happens synchronously and exactly once, before the first
    # request is awaited. This coroutine therefore does not touch the cache
    # while its own requests are in flight; only the I/O-bound fetches below
    # run concurrently.
    request_urls, dtype_metadata, geographic_specifiers = _fmt_download_url(
        dataset=dataset,
        year=year,
        vars=variables,
        tbls=tables,
        drop_annotation_vars=drop_annotation_variables,
        **geographic_specifiers
    )

    raw_table = await batch_fetch_content(session, request_urls, retry_rate, timeout_rate)
    cleaned_table = _df_cleaner(
        raw_table, year, dtype_metadata, convert_to_na, **geographic_specifiers
    )

    if not include_geometries:
        return cleaned_table

    return append_geographic_info(
        cleaned_table, year, with_geometry_id_columns, **geographic_specifiers
    )

def _df_cleaner(
    df: pd.DataFrame,
    year: int,
    meta_dict: t.Dict[t.Any, t.Any],
    drop_na: bool = True,
    **geo_specifiers
) -> pd.DataFrame:
    """
    Internal dataframe cleaner for queried Census Bureau data.

    In order, the cleaner: adds a `YEAR` column, upper-cases every column
    label, moves the identifier columns to the front (remaining columns are
    ordered alphabetically), removes duplicated column labels, sorts the rows,
    applies the cached dtypes, and optionally replaces special values with NaN.

    Note that the `YEAR` column and the upper-cased labels are written onto the
    `df` object that was passed in.

    Parameters
    ----------
    df
        The raw table produced by the fetch step.

    year
        The calendar year of the query; stored in the `YEAR` column.

    meta_dict
        A dictionary containing data types for each of the queried variables.

    drop_na
        Indicate whether or not special values should be replaced with `np.nan`
        values (despite the name, nothing is dropped). Default `True`.

    geo_specifiers
        A set of geographic specifiers specifying the geographies on which
        queried data should be restricted to.

    Returns
    -------
    The cleaned :py:class:`pandas.DataFrame`.
    """
    # Stamp the query year and normalise every label to upper case
    df['YEAR'] = year
    df.columns = [label.upper() for label in df.columns]

    # Identifier columns lead; everything else follows in alphabetical order
    present_labels = list(df.columns)
    geo_col_labs = GeoSpecFmtter.get_geo_cols(**geo_specifiers)
    id_candidates = ['NAME', 'GEO_ID', 'UCGID', *geo_col_labs, 'YEAR']
    id_cols = [label for label in id_candidates if label in present_labels]
    data_cols = sorted(label for label in present_labels if label not in id_cols)
    df = df[id_cols + data_cols]

    # Keep only the first occurrence of any repeated column label
    keep_mask = ~df.columns.duplicated()
    df = df.iloc[:, keep_mask].copy()

    # Prefer GEO_ID as the sort key; otherwise fall back to the identifier
    # columns, leaving NAME out of the key.
    if 'GEO_ID' in df.columns:
        # Retain only what follows the first 'US' marker of each GEO_ID
        df['GEO_ID'] = [geo_id.split('US', 1)[-1] for geo_id in df['GEO_ID']]
        sort_key = 'GEO_ID'
    else:
        if 'NAME' in id_cols:
            id_cols.remove('NAME')
        sort_key = id_cols
    df.sort_values(by=sort_key, ignore_index=True, inplace=True)

    # Apply the dtypes recorded for the queried variables
    df = variable_cache._set_dtypes(df, meta_dict)

    # Swap special values for NaN (if specified; default 'True')
    if drop_na:
        df.replace(REPLACEMENT_VALUES, np.nan, inplace=True)

    return df

def _fmt_download_url(
    dataset: str,
    year: int,
    vars: t.Optional[t.Union[t.List[str], str]] = None,
    tbls: t.Optional[t.Union[t.List[str], str]] = None,
    drop_annotation_vars: bool = True,
    **geog_specifiers
) -> t.Tuple[list[str], t.Dict[str, str], t.Dict[str, str]]:
    """
    Internal for formatting the download links to the Census Bureau.

    The resolved variables are split into batches of at most 50 per URL (to
    cover the case in which a user queries 50+ variables at once).

    Parameters
    ----------
    dataset
        The ACS dataset being queried.

    year
        The calendar year being queried.

    vars
        A variable, or list of variables, requested by the user.

    tbls
        A table, or list of tables, requested by the user.

    drop_annotation_vars
        Forwarded to the variable cache when resolving the variable list.

    geog_specifiers
        The geographic specifiers of the query.

    Returns
    -------
    A 3-tuple of: the list of request URLs (one per batch of variables), the
    metadata dictionary produced by the variable cache, and the geographic
    specifiers as returned by the URL formatter.
    """
    batch_size = 50

    url, geog_specifiers = _fmt_url(dataset, year, **geog_specifiers)
    vars, meta_dict = variable_cache._vars_metadata(dataset, year, vars, tbls, drop_annotation_vars)

    # The stop value must be `len(vars)`, not `len(vars) + 1`: with the latter,
    # a variable count that is an exact multiple of the batch size yielded an
    # extra, empty batch and hence a trailing URL with nothing after `get=`.
    # `max(..., 1)` keeps the previous behaviour of emitting a single URL when
    # the resolved variable list is empty.
    batch_starts = range(0, max(len(vars), 1), batch_size)
    urls = [url.format(','.join(vars[start:start + batch_size])) for start in batch_starts]

    return urls, meta_dict, geog_specifiers

420

421

def _fmt_url(dataset: str, year: int, **geog_specifiers):
    """
    Formatter skeleton for the URLs.

    Builds the request URL for `dataset`/`year` with everything filled in
    except the variable list: a literal `{}` placeholder is left after `get=`
    so that the caller can later `.format()` each batch of variables into it.

    Returns
    -------
    A 2-tuple of the URL skeleton and the geographic specifiers as returned by
    `GeoSpecFmtter.get_fmt_path`.
    """
    geo_specs, geog_specifiers = GeoSpecFmtter.get_fmt_path(dataset, year, **geog_specifiers)

    template = 'https://api.census.gov/data/{year}/{dataset}?get={var}{geo_specs}{key}'
    fields = {
        'var': '{}',  # placeholder that survives this round of formatting
        'dataset': dataset,
        'year': str(year),
        'geo_specs': geo_specs,
        # Whatever the key configuration returns is appended verbatim
        'key': api_key_config._get_api_key(),
    }

    return template.format(**fields), geog_specifiers

def append_geographic_info(
    data_df: pd.DataFrame,
    year: int,
    with_geometry_columns: bool = False,
    **geographic_specifiers: t.Any
) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Add geographic data to the queried dataset.

    Parameters
    ----------
    data_df
        The returned :py:class:`pandas.DataFrame` instance generated from the
        query.

    year
        The calendar year for the queried data.

    with_geometry_columns
        Indicate whether or not to keep the shapefile's own non-geometry
        columns (i.e. merged columns that are not already part of `data_df`)
        in the result. Default `False`.

    geographic_specifiers
        The set of geographic specifiers that were used from the query. The
        last specifier determines the shapefile scope.

    Returns
    -------
    The queried data merged with a `geometry` column for each record/row,
    *provided a shapefile is found*. Otherwise, the originally queried data is
    returned unchanged (the shapefile handler is expected to have issued a
    warning in that case).
    """
    shapes = _get_shpfile(data_df, year, **geographic_specifiers)

    # An empty frame suggests shapefile non-existence and/or naming convention
    # issues; hand the queried data back untouched.
    if shapes.empty:
        return data_df

    # The merge keys are looked up by the last (innermost) specifier
    innermost_scope = list(geographic_specifiers)[-1]
    _, data_keys, _, shape_keys, _ = GEO_SPEC_METADATA[innermost_scope]

    # Also join on GEO_ID when both sides carry it
    both_have_geo_id = 'GEO_ID' in data_df.columns and 'GEO_ID' in shapes.columns
    if both_have_geo_id:
        data_keys = ['GEO_ID', *data_keys]
        shape_keys = ['GEO_ID', *shape_keys]

    # Right join: every queried record is kept, matched or not
    merged = pd.merge(
        left=shapes,
        right=data_df,
        how="right",
        left_on=[*shape_keys, 'YEAR'],
        right_on=[*data_keys, 'YEAR'],
    )

    # Queried data columns first, optional shapefile columns next, geometry last
    queried_cols = list(data_df.columns)
    if with_geometry_columns:
        already_placed = [*queried_cols, 'geometry']
        shape_only_cols = [label for label in merged.columns if label not in already_placed]
    else:
        shape_only_cols = []

    return merged[queried_cols + shape_only_cols + ['geometry']]

def _get_shpfile(
    df: pd.DataFrame,
    year: int,
    **geographic_specifiers: t.Any
) -> gpd.GeoDataFrame:
    """
    Underlying routine that makes runs to the shapefile database (if files are
    not previously cached) with our shapefile handler and formats any shapefiles
    requiring a 'state' outer point of reference.

    Parameters
    ----------
    df
        The queried data; only consulted for a `STATE` column when the
        state-by-state fallback is needed.

    year
        The calendar year for the queried data.

    geographic_specifiers
        The geographic specifiers used for the query. The last specifier
        determines the shapefile scope.

    Returns
    -------
    The fetched :py:class:`geopandas.GeoDataFrame`. An *empty*
    :py:class:`geopandas.GeoDataFrame` is returned when no shapefile could be
    obtained, which callers detect through `.empty`.
    """
    try:
        gdf = shapefile_handler.fetch_tiger_shpfile(year, **geographic_specifiers)
        if gdf is None:
            # Indicates our anticipated naming convention issues or shapefile non-existence
            return gpd.GeoDataFrame()
        return gdf

    # Specific handling for shapefiles whose outer point of reference is 'state'
    except ShpfileFormatterException:

        # If 'STATE' is found, great. Otherwise, we load all 'state' FIPS codes and
        # extract the shapefiles from each. The latter typically arises when users
        # run data queries for certain geographic pathways of length 1 and/or containing
        # wildcard operators (e.g. {'congressional_district': '*'}).
        if 'STATE' in df.columns:
            states = list(df['STATE'].unique())
        else:
            states = list(STATE_FIPS.values())

        # Shapefile scope governed by last specifier
        shpfile_scope = list(geographic_specifiers)[-1]

        gdfs = []
        # Per-state warnings are silenced: a miss for an individual state is
        # expected and simply skipped.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for state in states:
                gdf = shapefile_handler.fetch_tiger_shpfile(year, **{'state': state, shpfile_scope: '*'})
                if gdf is not None:
                    gdfs.append(gdf)

        # `pd.concat` raises ValueError on an empty list. When not a single
        # state yielded a shapefile, fall back to the same "empty frame" signal
        # used above, and warn, since the per-state warnings were suppressed.
        if not gdfs:
            warnings.warn(
                f"Could not locate any TIGER shapefiles for the '{shpfile_scope}' scope "
                f"for the {year} calendar year.",
                UserWarning
            )
            return gpd.GeoDataFrame()

        return pd.concat(gdfs, ignore_index=True)

550

551

552

def confined_download(area_threshold: t.Union[int, float] = 0.7, **geographic_specifiers):
    """
    Set up a download of American Community Survey (ACS) data for geographies
    of interest that are confined to the outer-layer of geographies specified here.

    Parameters
    ----------
    area_threshold
        What percentage of the inner-layer set of geographies' areas must be
        within the outer-layer geography area. Must lie between 0 and 1.
        Default 0.7.

    geographic_specifiers
        A set of geographic specifiers describing the outer-layer geography
        within which queried data, for the inner-layer set of geographies,
        should be confined.

        Use `~view_geographic_paths()` to list the fully-specified geographic
        paths available for an ACS dataset, or `~check_path_existence()` to check
        whether given specifiers are supported for a dataset and, if so, which
        geographic paths contain them.

    Returns
    -------
    A :py:class:`acspsuedo.query._ConfinedDownload` instance.

    *Note*: This class supports a (synchronous) download method, whose parameter
    space is the same as that of the normal `acspsuedo.query.download()` function.

    Raises
    ------
    ValueError
        If `area_threshold` is below 0 or above 1.
    """
    if area_threshold < 0 or area_threshold > 1:
        raise ValueError("Valid area threshold values must be between 0 and 1.")

    handler = _ConfinedDownload(area_threshold, **geographic_specifiers)
    return handler

584

585

586

class _ConfinedDownload:
    """
    Handler for downloading ACS data at geographic specifiers that are
    confined to a different scope than readily permissible.

    The instance holds the outer-layer geographic specifiers and an area
    threshold. Its `download()` method queries an inner layer of geographies
    and keeps only those whose area overlaps the outer layer by at least the
    threshold. The outer-layer geometry is fetched once and re-used for later
    calls made with the same outer specifiers, dataset and year.
    """
    def __init__(self, area_threshold: t.Union[int, float], **geographic_specifiers) -> None:
        self._area_threshold = area_threshold
        self._geographic_specifiers = geographic_specifiers

        # Internals for:
        # - Checking if a query attempt has been made,
        # - Which (dataset, year) that attempt was made for, and
        # - The outer-layer geography (if an attempt is successful)
        self._query_attempt = False
        self._query_key = None
        self._outer_geography = None

    @property
    def area_threshold(self):
        """
        The percentage of the inner-layer of geographies' areas that must be within the
        outer-layer geography area.
        """
        return self._area_threshold

    @area_threshold.setter
    def area_threshold(self, new_threshold: t.Union[int, float]):
        if (0 > new_threshold) or (new_threshold > 1):
            raise ValueError("Valid area threshold values must be between 0 and 1.")
        self._area_threshold = new_threshold

    @property
    def geographic_specifiers(self):
        """
        The geographic specifiers indicating the outer-layer geography area to which
        inner-level geographies from queried data will be confined to.
        """
        return self._geographic_specifiers

    @geographic_specifiers.setter
    def geographic_specifiers(self, new_specifiers: t.Dict[t.Any, t.Any]):
        # Reset query attempt state, so the stored outer geometry is re-fetched.
        if self._geographic_specifiers != new_specifiers:
            self._query_attempt = False
        self._geographic_specifiers = new_specifiers

    def __repr__(self) -> str:
        return "_ConfinedDownload(area_threshold = {}, geographic_specifiers = {{{}}})".format(
            self._area_threshold,
            ', '.join([f"{k} = {v}" for k, v in self._geographic_specifiers.items()])
        )

    def __eq__(self, other) -> bool:
        # Two handlers are equal when both threshold and outer specifiers match.
        if isinstance(other, _ConfinedDownload):
            return (self._area_threshold == other._area_threshold) and \
                   (self._geographic_specifiers == other._geographic_specifiers)
        return False

    def download(
        self,
        dataset: str,
        year: int,
        *,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
        drop_annotation_variables: bool = True,
        convert_to_na: bool = True,
        include_geometries: bool = False,
        with_geometry_id_columns: bool = False,
        **geographic_specifiers
    ) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download data from the United States Census Bureau's American Community
        Survey (ACS) for geographies of interest that are confined to the
        outer-layer of geographies held by this handler.

        Note that you can specify particular variables, tables, or some
        combination of the two.

        Parameters
        ----------
        dataset
            A supported ACS dataset.

            The supported datasets, and the years available for each, are
            listed in `acspsuedo.datasets`.

        year
            A calendar year for the ACS dataset. It must be available for the
            dataset of interest.

        variables
            A variable, or list of variables, to be queried from the ACS dataset.

        tables
            A dataset table, or list of tables, supported by the ACS dataset of
            interest.

        drop_annotation_variables
            The Bureau often attaches supplementary, non-required attribute and
            margin-of-error information to estimate data. Indicate whether or
            not to drop this information. Default `True`.

        convert_to_na
            Indicate whether or not special values should be replaced with
            `np.nan` values. Default `True`.

        include_geometries
            Indicate whether or not to incorporate geometric information from
            the Census Bureau's TIGER Shapefile database in the returned data.
            Default `False`. (Geometries are always fetched internally, because
            the confinement itself is geometric; this flag only controls
            whether the `geometry` column is kept.)

            Note: because TIGER shapefile naming conventions have changed over
            the years, and because some scopes have no geometric information at
            all, geometries cannot always be obtained. In that case the plain
            :py:class:`pandas.DataFrame` of queried inner-layer data is
            returned, without any confinement applied.

        with_geometry_id_columns
            If `include_geometries` is True, indicate whether or not to keep
            the identifier columns that accompany the geometric information.
            Default `False`.

            Note: these columns cohere with the identifier columns already
            returned by the data query and are therefore deemed redundant.

        geographic_specifiers
            The set of inner-layer geographic specifiers indicating the
            geographies to which queried data references. Queried data at this
            inner layer, in turn, will be confined to the outer-layer geography.

            Use `~view_geographic_paths()` to list the fully-specified
            geographic paths available for an ACS dataset, or
            `~check_path_existence()` to check whether given specifiers are
            supported for a dataset.

        Returns
        -------
        A :py:class:`pandas.DataFrame` containing the queried American Community
        Survey data of interest.

        If `include_geometries` is `True`, and TIGER shapefile data exists for
        the queried data, the return is a :py:class:`geopandas.GeoDataFrame`
        containing geometric shapefile information.

        Notes
        -----
        An empty result may be returned. This corresponds to a scenario in
        which there are no inner-layer geographies so much as touching the
        border of the outer-layer geography (or none meeting the threshold).

        For multiple queries (500+) in a session, it is recommended to obtain an
        API key. API keys are free to obtain at
        https://api.census.gov/data/key_signup.html. To supply an API key, set
        it in your operating system's environment, e.g.,
        ```
        import os
        os.environ['CENSUS_BUREAU_API_KEY'] = your_api_key_here
        ```
        or write it to a textfile in the working directory (`./api_key.txt`).
        If both are supplied, the OS environment key is prioritized.

        The locations of these settings can be customized.
        ```
        from acspsuedo.query import api_key_config
        api_key_config.FILE_PATH = 'location/to/new_file_path.txt'  # <- Set a custom filepath containing the key
        api_key_config.OS_ENV_LOCATION = 'new_env_key'  # <- Set a custom environment location to the key
        ```
        """
        # Geometries are always requested here, whatever the caller asked for:
        # they are needed to perform the confinement.
        inner_data = download(
            dataset = dataset,
            year = year,
            variables = variables,
            tables = tables,
            drop_annotation_variables = drop_annotation_variables,
            convert_to_na = convert_to_na,
            with_geometry_id_columns = with_geometry_id_columns,
            include_geometries = True,
            **geographic_specifiers
        )
        # To accommodate for cases when a TIGER shapefile cannot be found
        # (the plain queried data comes back instead of a GeoDataFrame)
        if not isinstance(inner_data, gpd.GeoDataFrame):
            return inner_data

        inner_crs = inner_data.crs  # <- Keep the original Coordinate Referencing System

        outer_data = self._get_outer_download(dataset, year, variables, tables)

        if outer_data is None:
            msg = \
                f"\nCould not locate the appropriate TIGER shapefile for the outer-layer set " \
                f"of geographies given by {self._geographic_specifiers} for the {year} calendar\n" \
                f"year.\n" \
                "\nAs a result, the returned set of data corresponds to data downloaded solely from " \
                "the reference of the inner-layer set of geographic specifiers."

            warnings.warn( msg, UserWarning )

            # Honour `include_geometries` on this fallback path as well, so the
            # result has the same shape as in the confined case.
            if not include_geometries:
                inner_data = inner_data.drop(columns = ['geometry'])
            return inner_data

        # Necessary to avoid modifying the referenced (cached) object
        outer_data = outer_data.copy()

        confined_data = self.__confined_data_fmtter(inner_data, outer_data, inner_crs, include_geometries)

        return confined_data

    def __confined_data_fmtter(
        self,
        inner_data: gpd.GeoDataFrame,
        outer_data: gpd.GeoDataFrame,
        inner_data_crs: t.Any,
        include_geometries: bool
    ) -> t.Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Internal for confining the inner-layer records to the outer-layer
        geometry and applying the area threshold.

        Both inputs are re-projected in place, so callers must pass frames they
        own (`download()` passes a fresh query result and a copy).
        """
        # Project to Web-Mercator
        # NOTE(review): EPSG:3857 is not an equal-area projection; the areas
        # below are only compared as a ratio of geometries in the same locale.
        inner_data.to_crs(3857, inplace=True)
        outer_data.to_crs(3857, inplace=True)

        # Keep the geometries (the join below retains only one active geometry)
        inner_data['inner_geometry'] = inner_data.geometry
        outer_data['outer_geometry'] = outer_data.geometry

        # Confining
        confined_data = inner_data.sjoin(
            outer_data,
            how = 'inner',
            predicate = 'intersects',
            lsuffix = 'inner',
            rsuffix = 'outer'
        )

        # Thresholding: keep a record when the overlap covers at least
        # `area_threshold` of the inner geometry's own area.
        overlap_area = confined_data['inner_geometry'].intersection(confined_data['outer_geometry']).area
        required_area = self._area_threshold * confined_data['inner_geometry'].area
        thresholded_data = confined_data[overlap_area >= required_area]

        # Cleaning (to ensure consistency w/o confinement). Non-inplace calls
        # are used because `thresholded_data` is a slice of `confined_data`.
        helper_cols = ['inner_geometry', 'outer_geometry',
                       *[col for col in thresholded_data if col.endswith('_outer')]]
        thresholded_data = thresholded_data.drop(columns = helper_cols)

        # Remove the join suffix only. `str.rstrip('_inner')` strips any
        # trailing run of the characters '_', 'i', 'n', 'e', 'r' and could
        # therefore truncate unrelated column names.
        thresholded_data.columns = [col.removesuffix('_inner') for col in thresholded_data.columns]
        thresholded_data = thresholded_data.reset_index(drop = True)

        # Restore to the original/inner CRS
        thresholded_data = thresholded_data.to_crs(inner_data_crs)

        # Drop the geometry column (if indicated False)
        if not include_geometries:
            thresholded_data = thresholded_data.drop(columns = ['geometry'])

        return thresholded_data

    def _get_outer_download(
        self,
        dataset: str,
        year: int,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
    ) -> t.Optional[gpd.GeoDataFrame]:
        """Internal for retrieving the outer-layer geography (None if unavailable)."""
        self.__set_outer_download(dataset, year, variables, tables)
        return self._outer_geography

    def __set_outer_download(
        self,
        dataset: str,
        year: int,
        variables: t.Optional[t.Union[t.List[str], str]] = None,
        tables: t.Optional[t.Union[t.List[str], str]] = None,
    ) -> None:
        """Internal for the actual call to the outer-layer geography."""
        # Run an attempt only if no previous attempt covers this request. The
        # stored result is tied to the (dataset, year) it was fetched for:
        # re-using it for another year would return that other year's geometry
        # (or a stale "not found") for the current query.
        query_key = (dataset, year)
        if self._query_attempt and getattr(self, '_query_key', None) == query_key:
            return

        # Warnings from this internal query are silenced; `download()` issues
        # its own warning when the outer geometry is unavailable.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            outer_data = download(
                dataset = dataset,
                year = year,
                variables = variables,
                tables = tables,
                include_geometries = True,
                **self._geographic_specifiers
            )

        # Given our downloads are confined to the outer-layer
        # of geographies, retain only the geometry column.
        if isinstance(outer_data, gpd.GeoDataFrame):
            self._outer_geography = outer_data[['geometry']].copy()
        else:
            self._outer_geography = None

        # Record the attempt, indicating that future queries made w/ the same
        # set of geographic specifiers, dataset and year should retrieve the
        # stored info from the instance and not run API calls.
        self._query_key = query_key
        self._query_attempt = True

Coverage for acspsuedo / query.py: 98%

169 statements