Coverage for censusdis/data.py: 94%

324 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

1# Copyright (c) 2022, 2023 Darren Erik Vengroff 

2""" 

3Utilities for loading census data. 

4 

5This module relies on the US Census API, which 

6it wraps in a pythonic manner. 

7""" 

8 

9import warnings 

10from logging import getLogger 

11from typing import ( 

12 Dict, 

13 Iterable, 

14 List, 

15 Mapping, 

16 Optional, 

17 Tuple, 

18 Union, 

19) 

20 

21import io 

22import requests 

23import gzip 

24 

25import geopandas as gpd 

26import numpy as np 

27import pandas as pd 

28 

29import censusdis.geography as cgeo 

30import censusdis.maps as cmap 

31from censusdis.impl.exceptions import CensusApiException 

32from censusdis.impl.fetch import data_from_url 

33from censusdis.impl.us_census_shapefiles import ( 

34 add_geography, 

35 clip_water, 

36 infer_geo_level, 

37 geo_query_from_data_query_inner_geo, 

38) 

39from censusdis.impl.varcache import VariableCache 

40from censusdis.impl.varsource.base import VintageType 

41from censusdis.impl.varsource.censusapi import CensusApiVariableSource 

42from censusdis.values import ALL_SPECIAL_VALUES 

43from censusdis.datasets import ACS5, DECENNIAL_PUBLIC_LAW_94_171 

44from censusdis.states import ABBREVIATIONS_FROM_IDS 

45 

46import censusdis.impl.fetch 

47 

48 

# Module-level logger named after this module, per standard `logging` practice.
logger = getLogger(__name__)

50 

51 

GeoFilterType = Optional[Union[str, Iterable[str]]]
"""
The type we accept for geographic filters.

They are used for the values of `kwargs` to
:py:func:`download`.

These filters are either single values as a string,
or, if multivalued, then an iterable containing all
the values allowed by the filter. For example::

    import censusdis.data as ced

    from censusdis.states import NJ, NY, CT

    # Two different kinds of kwarg for `state=`, both of
    # which are of `GeoFilterType`:
    df_one_state = ced.download("acs/acs5", 2020, ["NAME"], state=NJ)
    df_tri_state = ced.download("acs/acs5", 2020, ["NAME"], state=[NJ, NY, CT])
"""

72 

73 

def _gf2s(geo_filter: GeoFilterType) -> Optional[str]:
    """
    Render a geographic filter in the string form the Census API expects.

    The Census API encodes multiple filter values as a single
    comma-separated string; `None` and plain strings pass through
    unchanged.
    """
    if isinstance(geo_filter, str):
        return geo_filter
    if geo_filter is None:
        return None
    return ",".join(geo_filter)

84 

85 

_MAX_VARIABLES_PER_DOWNLOAD = 50
"""
The maximum number of variables we can ask for in one census API query.

The U.S. Census sets this limit, not us. In order to not expose our
users to the limit, :py:func:`~download` mostly obscures the fact that
requests to download more than this many variables are broken into
multiple calls to the census API and then the results are stitched back
together by either merging or concatenation. This is all handled in
:py:func:`~_download_multiple`.
"""

97 

98 

__dw_strategy_metrics = {"merge": 0, "concat": 0}
"""
Counters for how often we use each strategy for wide tables.

`_download_multiple` increments exactly one of these each time it
combines the results of multiple census API calls, and
`_download_wide_strategy_metrics` returns a snapshot of them.
"""

103 

104 

def _download_wide_strategy_metrics() -> Dict[str, int]:
    """
    Report how often each strategy has been used for wide tables.

    Returns
    -------
    A dictionary of metrics on how often each strategy has
    been used.
    """
    # Return a copy so callers cannot mutate the module-level counters.
    return dict(__dw_strategy_metrics)

115 

116 

def _download_multiple(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    *,
    query_filter: Dict[str, str],
    api_key: Optional[str],
    census_variables: "VariableCache",
    with_geometry: bool,
    with_geometry_columns: bool,
    tiger_shapefiles_only: bool,
    row_keys: Union[str, Iterable[str]],
    **kwargs: cgeo.InSpecType,
) -> pd.DataFrame:
    """
    Download data in groups of columns and concatenate the results together.

    The reason for this function is that the API will only return a maximum
    of 50 columns per query. This function downloads wider data 50 columns
    at a time and concatenates them.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    census_variables
        A cache of metadata about variables.
    row_keys
        An optional set of identifier keys to help merge together requests for more than the census API limit of
        50 variables per query. These keys are useful for census datasets such as the Current Population Survey
        where the geographic identifiers do not uniquely identify each row.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The full results of the query with all columns.

    """
    # Divide the variables into groups. If row keys are provided, include them in each chunk of variables,
    # while respecting the variable max.
    if row_keys:
        chunk_size = _MAX_VARIABLES_PER_DOWNLOAD - len(row_keys)
        variable_groups = [
            # Prepend the row keys to every chunk, then drop duplicates
            # since the row_key variables might already be present in one
            # of the chunks. `dict.fromkeys` preserves order. (The previous
            # filter expression here was a no-op: `list.index` always
            # returns the first occurrence, so its condition was always
            # true and duplicates were never removed.)
            list(
                dict.fromkeys(
                    row_keys
                    + download_variables[start : start + chunk_size]  # noqa: E203
                )
            )
            for start in range(0, len(download_variables), chunk_size)
        ]
    else:
        variable_groups = [
            # black and flake8 disagree about the whitespace before ':' here...
            download_variables[start : start + _MAX_VARIABLES_PER_DOWNLOAD]  # noqa: E203
            for start in range(0, len(download_variables), _MAX_VARIABLES_PER_DOWNLOAD)
        ]

    if len(variable_groups) < 2:
        raise ValueError(
            "_download_multiple expects to be called with at least "
            f"{_MAX_VARIABLES_PER_DOWNLOAD + 1} variables. With fewer, "
            "use download instead."
        )

    # Get the data for each chunk. Note that we leave out
    # extra geometry columns at this point. We will get them
    # later if we need them, but they get in the way at this
    # point.
    dfs = [
        download(
            dataset,
            vintage,
            variable_group,
            query_filter=query_filter,
            api_key=api_key,
            variable_cache=census_variables,
            # Only the first chunk carries geometry; it would be
            # redundant on the others.
            with_geometry=with_geometry and (ii == 0),
            with_geometry_columns=False,
            **kwargs,
        )
        for ii, variable_group in enumerate(variable_groups)
    ]

    # What variables came back in the first df but were not
    # requested? These are a key to the geography the row
    # represents. For example, 'STATE' and 'COUNTY' might
    # be these variables if we did a county-level query to
    # the census API.
    geo_key_variables = [f for f in dfs[0].columns if f not in set(variable_groups[0])]

    # Now that we know the geometry keys, we may have to get back the other
    # geometry columns we left out the first time.
    if with_geometry and with_geometry_columns:
        dfs[0] = download(
            dataset,
            vintage,
            variable_groups[0],
            query_filter=query_filter,
            api_key=api_key,
            variable_cache=census_variables,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            **kwargs,
        )

    # If we put in the geometry column, it's not part of the
    # key.
    if with_geometry:
        geo_key_variables = [f for f in geo_key_variables if f != "geometry"]

    # Now we have to decide if we are going to use the merge
    # strategy or the concat strategy to combine the data frames
    # we downloaded. Why do we have two strategies? Because we are
    # dealing with two kinds of data. One kind, from data sets like
    # ACS (https://www.census.gov/programs-surveys/acs.html),
    # has a unique key of columns that specify geography. The other
    # kind, from data sets like CPS
    # (https://www.census.gov/programs-surveys/cps.html) doesn't.
    #
    # In the unique key case, we can join the data frames that come
    # back on those key columns and get the final wide data frame
    # we want for the user.
    #
    # In the non-unique key case, we can't do this. There data sets
    # may have multiple rows for a value of the key columns. We can't
    # join here. Instead, we can only concatenate the tables
    # horizontally and hope that the rows came back in the same order
    # for each of them.

    # We hope to be able to merge. It is safer. If row_keys is supplied, they are included in
    # merge keys.
    merge_strategy = True
    if row_keys:
        merge_keys = geo_key_variables + row_keys
    else:
        merge_keys = geo_key_variables
    # But if there are any non-unique keys in any df, we can't
    # merge.
    for df_slice in dfs:
        if len(df_slice.value_counts(merge_keys, sort=False)) != len(df_slice.index):
            merge_strategy = False
            break

    if with_geometry and with_geometry_columns and not merge_strategy:
        raise ValueError(
            "`with_geometry_columns=True` is only supported for very wide results "
            "when the merge_strategy can be used. This merge strategy is used when every "
            "row of the result is for a unique geography, as is the case in data sets like "
            f'ACS5 ("{ACS5}") and DECENNIAL_PUBLIC_LAW_94_171 ("{DECENNIAL_PUBLIC_LAW_94_171}"). '
            "If this functionality is really important to you (note that it would create a lot "
            "of duplicate geometry values), we suggest you set `with_geometry=False` in this call "
            "and then merge with a `GeoDataFrame` with the geometries you want after the fact."
        )

    if merge_strategy:
        # We can do the merge strategy.

        __dw_strategy_metrics["merge"] = __dw_strategy_metrics["merge"] + 1

        df_data = dfs[0]

        for df_right in dfs[1:]:
            df_data_columns = set(df_data.columns)
            df_data = df_data.merge(
                df_right[
                    [
                        col
                        for col in df_right.columns
                        if (col in merge_keys) or (col not in df_data_columns)
                    ]
                ],
                on=merge_keys,
            )
    else:
        # We are going to have to fall back on the concat
        # strategy. Before we do the concat, however, let's
        # double-check that the key columns are the same in
        # at the corresponding row in every df. Otherwise, something
        # is fishy, and it is not safe to concat without mixing
        # data that should be in different rows.

        rows0 = len(dfs[0].index)

        for df_slice in dfs[1:]:
            if not (
                rows0 == len(df_slice.index)
                and dfs[0][geo_key_variables].equals(df_slice[geo_key_variables])
            ):
                # At least one difference. So we cannot use the
                # concat strategy either.
                if not row_keys:
                    raise CensusApiException(
                        "Neither the merge nor the concat strategy is viable. "
                        "We made multiple queries to the census API because more than "
                        f"{_MAX_VARIABLES_PER_DOWNLOAD} variables were requested. "
                        "If you don't need all the variables, it is always safer to "
                        f"download less than {_MAX_VARIABLES_PER_DOWNLOAD} variables. "
                        f"If you need more than {_MAX_VARIABLES_PER_DOWNLOAD}, you can supply the `row_keys` "
                        "argument with a set of variables that uniquely identify each row."
                    )
                else:
                    raise CensusApiException(
                        f"Neither the merge nor the concat strategy is viable using row_keys: {row_keys}. "
                        "The supplied keys should uniquely identify every row in the dataset to work. "
                        "If you don't need all the variables, it is always safer to "
                        f"download less than {_MAX_VARIABLES_PER_DOWNLOAD} variables. "
                    )

        # Concat strategy is as safe as it will ever be. We hope the server
        # side did not reorder the results across queries.
        logger.info(
            "Using the concat strategy, which is not guaranteed reliable if "
            "the census API returned data for multiple sub-queries of less than "
            "or equal to %d in different row orders. "
            "It is always safest to query no more than %d "
            "variables at a time. Please do so unless you really need them all.",
            _MAX_VARIABLES_PER_DOWNLOAD,
            _MAX_VARIABLES_PER_DOWNLOAD,
        )

        __dw_strategy_metrics["concat"] = __dw_strategy_metrics["concat"] + 1

        df_data = pd.concat(
            [dfs[0]] + [df.drop(geo_key_variables, axis="columns") for df in dfs[1:]],
            axis="columns",
        )

    return df_data

395 

396 

def download_lodes(
    dataset: str,
    vintage: VintageType,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    version: Optional[str] = None,
    home_geography: Optional[Union[bool, Dict[str, str]]] = None,
    with_geometry: bool = False,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
    remove_water: bool = False,
    **kwargs: cgeo.InSpecType,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download LODES data from the US Census API.

    This is typically not called directly, but instead LODES data
    is obtained by calling :py:func:`~download`, which then calls
    this as needed for LODES data sets.

    Parameters
    ----------
    dataset
        The LODES dataset to download from, of the form
        `"lodes/{type}/{part_or_segment}/{job_type}"`, for example
        `"lodes/od/main/jt00"`. The type component is one of
        `"od"`, `"rac"`, or `"wac"`.
    vintage
        The vintage to download data for. For LODES data this is
        an integer year, for example, `2020`.
    download_variables
        The census variables to download. If `None`, every non-string
        column in the LODES file is downloaded.
    version
        The LODES version to use when constructing the download URL.
        Defaults to `"LODES8"` if `None`.
    home_geography
        For origin-destination (`"od"`) data sets, how to group the home
        side of each record. If `True`, the same geography specification
        given in `kwargs` is used for the home side. If a dictionary, it
        is used as the home-side geography specification. If `None` or
        `False`, all home-side (`*_H`) columns are used as grouping keys.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    kwargs
        A specification of the geometry that we want data for. Must bind
        `state` to a single state; wildcards are not supported.

    Returns
    -------
    A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the
    requested LODES data.
    """
    if version is None:
        version = "LODES8"

    # A bare `home_geography=True` means "group the home side the same
    # way as the work side", i.e. by the geography given in kwargs.
    if isinstance(home_geography, bool):
        if home_geography:
            home_geography = dict(**kwargs)
        else:
            home_geography = None

    bound_path = cgeo.PathSpec.partial_prefix_match(dataset, vintage, **kwargs)
    geo_bindings = bound_path.bindings

    state = geo_bindings["state"]

    if state == "*":
        # TODO - we could just concatenate them all.
        raise ValueError("Wildcards not supported for state LODES data.")

    if state not in ABBREVIATIONS_FROM_IDS:
        raise ValueError(f"Unknown state id {state}")

    state_name = ABBREVIATIONS_FROM_IDS[state].lower()

    _, data_set_type, part_or_segment, job_type = dataset.split("/")

    # For rac and wac data sets, the middle component of the file name
    # (the segment) is upper case on the server.
    if data_set_type in ["rac", "wac"]:
        part_or_segment = part_or_segment.upper()

    url = (
        f"https://lehd.ces.census.gov/data/lodes/{version}/{state_name}/{data_set_type}/{state_name}_{data_set_type}_"
        f"{part_or_segment}_{job_type.upper()}_{vintage}.csv.gz"
    )

    logger.info(f"Downloading LODES data from {url}")

    results = requests.get(url)
    if results.status_code != requests.status_codes.codes.OK:
        raise CensusApiException(
            f"Unable to get LODES data. Attempted to fetch from {url}. "
            f"Status: {results.status_code}; {results.reason}"
        )

    # Reuse the response body we already fetched above instead of
    # downloading the file a second time (the original code issued a
    # redundant second `requests.get(url)` here).
    gz_content = results.content
    content = gzip.decompress(gz_content)
    df_lodes = pd.read_csv(
        io.StringIO(content.decode("utf-8")), dtype={"w_geocode": str, "h_geocode": str}
    )

    # We don't need the date.
    df_lodes = df_lodes.drop("createdate", axis="columns")

    # Map the geographies to the conventions censusdis uses.

    def map_geo_cols(*, from_prefix: str, to_suffix: str = ""):
        """Split a 15-digit LODES geocode column into STATE/COUNTY/TRACT/BLOCK columns."""
        df_lodes[f"STATE{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[:2]
        df_lodes[f"COUNTY{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[2:5]
        df_lodes[f"TRACT{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[5:11]
        df_lodes[f"BLOCK{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[11:15]

    if data_set_type in ["od", "wac"]:
        map_geo_cols(from_prefix="w_")
    else:
        map_geo_cols(from_prefix="h_")

    if data_set_type == "od":
        map_geo_cols(from_prefix="h_", to_suffix="_H")

    for geocode_col in ["w_geocode", "h_geocode"]:
        if geocode_col in df_lodes.columns:
            # `DataFrame.drop` returns a new frame; assign it back.
            # (The original code discarded the result, so the geocode
            # columns were never actually dropped.)
            df_lodes = df_lodes.drop(geocode_col, axis="columns")

    group_keys = []
    selectors = {}

    for geo, binding in geo_bindings.items():
        group_keys.append(f"{geo.upper()}")
        if binding != "*":
            selectors[f"{geo.upper()}"] = binding

    if data_set_type == "od":
        if home_geography is None:
            # No home-side grouping requested; keep every home-side
            # column as a grouping key.
            for col in df_lodes.columns:
                if col.endswith("_H"):
                    group_keys.append(col)
        else:
            # There is more grouping to do.
            home_bound_path = cgeo.PathSpec.partial_prefix_match(
                dataset, vintage, **home_geography
            )
            home_geo_bindings = home_bound_path.bindings

            for geo, binding in home_geo_bindings.items():
                group_keys.append(f"{geo.upper()}_H")
                if binding != "*":
                    selectors[f"{geo.upper()}_H"] = binding

    # Filter down based on fixed bindings.
    if selectors:
        criteria = None
        for col, binding in selectors.items():
            if criteria is None:
                criteria = df_lodes[col] == binding
            else:
                criteria = criteria & (df_lodes[col] == binding)
        df_lodes = df_lodes[criteria]

    if download_variables is None:
        download_variables = [
            col for col in df_lodes.columns if df_lodes[col].dtype != object
        ]

    # Group based on group keys.
    df_lodes = df_lodes.groupby(group_keys)[download_variables].sum().reset_index()

    if with_geometry:
        # We need to get the geometry and merge it in.
        geo_level = bound_path.path_spec.path[-1]
        shapefile_scope = bound_path.bindings[bound_path.path_spec.path[0]]

        gdf_data = add_geography(
            df_lodes,
            vintage,
            shapefile_scope,
            geo_level,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
        )

        if remove_water:
            gdf_data = clip_water(gdf_data, vintage)

        return gdf_data

    return df_lodes

587 

588 

def download(
    dataset: str,
    vintage: VintageType,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    *,
    group: Optional[Union[str, Iterable[str]]] = None,
    leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
    set_to_nan: Union[bool, Iterable[int]] = True,
    skip_annotations: bool = True,
    query_filter: Optional[Dict[str, str]] = None,
    with_geometry: bool = False,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
    remove_water: bool = False,
    download_contained_within: Optional[Dict[str, cgeo.InSpecType]] = None,
    area_threshold: float = 0.8,
    api_key: Optional[str] = None,
    variable_cache: Optional["VariableCache"] = None,
    row_keys: Optional[Union[str, Iterable[str]]] = None,
    **kwargs: cgeo.InSpecType,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download data from the US Census API.

    This is the main API for downloading US Census data with the
    `censusdis` package. There are many examples of how to use
    this in the demo notebooks provided with the package at
    https://github.com/vengroff/censusdis/tree/main/notebooks.

    *A note on variables and groups*: there are multiple ways to specify the
    variables you want to download, either individually in `download_variables`,
    by one or more groups in `group`, and by the leaves of one or more groups
    in `leaves_of_group`. Note that these three sources of variables are
    deduplicated, so you will only get one column for a variable no matter
    how many times it is specified.

    *Specifying census geographies*: censusdis provides access to many
    census datasets, each of which can be retrieved at a particular set of
    geographic grains. To accommodate this, `download()` takes a set
    of kwargs to define the geographic level of the returned data. You can check
    which geographies are available for a particular dataset with
    `geographies()`.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"`
        in :py:module:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    group
        One or more groups (as defined by the U.S. Census for the data set)
        whose variable values should be downloaded. These are in addition to
        any specified in `download_variables`.
    leaves_of_group
        One or more groups (as defined by the U.S. Census for the data set)
        whose leaf variable values should be downloaded. These are in addition to
        any specified in `download_variables` or `group`. See
        :py:meth:`VariableCache.group_leaves` for more details on the semantics of
        leaves vs. non-leaf group variables.
    set_to_nan
        A list of values that should be set to NaN. Normally these are special
        values that the U.S. Census API sometimes returns. If `True`, then all
        values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
        If `False`, no replacements will be made.
    skip_annotations
        If `True` try to filter out `group` or `leaves_of_group` variables that are
        annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
        for more details. Variable names passed in `download_variables` are not
        affected by this flag.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    download_contained_within
        A dictionary specifying the geography or geographies that our results
        should be filtered down to be contained within.
    area_threshold
        What fraction of the area of other geographies must be contained
        in our geography to be included. Ignored if `download_contained_within` is
        `None`.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited to 500 per day.
    variable_cache
        A cache of metadata about variables.
    row_keys
        An optional set of identifier keys to help merge together requests for more than the census API limit of
        50 variables per query. These keys are useful for census datasets such as the Current Population Survey
        where the geographic identifiers do not uniquely identify each row.
    kwargs
        A specification of the geometry that we want data for. For example,
        `state = "*", county = "*"` will download county-level data for
        the entire US.

    Returns
    -------
    A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
    """
    if dataset.startswith("lodes/"):
        # Special case for the LODES data sets, which go down a completely
        # different path.
        if download_contained_within is not None:
            raise ValueError(
                "`download_contained_within` not supported for LODES data sets."
            )

        return download_lodes(
            dataset,
            vintage,
            download_variables,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            **kwargs,
        )

    if download_contained_within is not None:
        # Put the contained_within context around it.
        return contained_within(
            area_threshold=area_threshold, **download_contained_within
        ).download(
            dataset,
            vintage,
            download_variables,
            group=group,
            leaves_of_group=leaves_of_group,
            set_to_nan=set_to_nan,
            skip_annotations=skip_annotations,
            query_filter=query_filter,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            api_key=api_key,
            variable_cache=variable_cache,
            row_keys=row_keys,
            **kwargs,
        )

    # Fall back on the module-level default variable cache.
    if variable_cache is None:
        variable_cache = variables

    # Ensure list operations work.
    if row_keys:
        row_keys = list(row_keys)

    # The side effect here is to prime the cache.
    cgeo.geo_path_snake_specs(dataset, vintage)

    if set_to_nan is True:
        set_to_nan = ALL_SPECIAL_VALUES

    # In case they came to us in py format, as kwargs often do.
    kwargs = {
        cgeo.path_component_from_snake(dataset, vintage, k): v
        for k, v in kwargs.items()
    }

    # Parse out the download variables.
    download_variables = _parse_download_variables(
        dataset,
        vintage,
        download_variables=download_variables,
        group=group,
        leaves_of_group=leaves_of_group,
        skip_annotations=skip_annotations,
        variable_cache=variable_cache,
    )

    # `row_keys` only matters when the request must be split into
    # multiple API calls; warn (and effectively ignore it) otherwise.
    if len(download_variables) <= _MAX_VARIABLES_PER_DOWNLOAD and row_keys:
        warnings.warn(
            "\n The row_keys argument is intended to be used only when the number of requested"
            "\n variables exceeds the Census defined limit of 50"
            "\n The supplied value(s) will be ignored",
            UserWarning,
        )
    # Special case if we are trying to get too many fields.
    if len(download_variables) > _MAX_VARIABLES_PER_DOWNLOAD:
        return _download_multiple(
            dataset,
            vintage,
            download_variables,
            api_key=api_key,
            census_variables=variable_cache,
            query_filter=query_filter,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            row_keys=row_keys,
            **kwargs,
        )

    # Prefetch all the types before we load the data.
    # That way we fail fast if a field is not known.
    _prefetch_variable_types(dataset, vintage, download_variables, variable_cache)
    # Also check that the row_keys, if supplied, are present in the dataset.
    if row_keys:
        _prefetch_variable_types(dataset, vintage, row_keys, variable_cache)

    # If we were given a list, join it together into
    # a comma-separated string.
    string_kwargs = {k: _gf2s(v) for k, v in kwargs.items()}

    return _download_remote(
        dataset,
        vintage,
        download_variables=download_variables,
        set_to_nan=set_to_nan,
        query_filter=query_filter,
        with_geometry=with_geometry,
        with_geometry_columns=with_geometry_columns,
        tiger_shapefiles_only=tiger_shapefiles_only,
        remove_water=remove_water,
        api_key=api_key,
        variable_cache=variable_cache,
        **string_kwargs,
    )

842 

843 

def _download_remote(
    dataset: str,
    vintage: VintageType,
    *,
    download_variables: List[str],
    set_to_nan: Union[bool, Iterable[float]] = True,
    query_filter: Optional[Dict[str, str]] = None,
    with_geometry: bool,
    with_geometry_columns: bool,
    tiger_shapefiles_only: bool,
    remove_water: bool,
    api_key: Optional[str],
    variable_cache: "VariableCache",
    **kwargs,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Make the actual remote call to download the data.

    This is the last stage of a download: the variables and geography
    have already been parsed and validated by the caller.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    set_to_nan
        A list of values that should be set to NaN. Normally these are special
        values that the U.S. Census API sometimes returns. If `True`, then all
        values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
        If `False`, no replacements will be made.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned. This filtering happens server side, which
        is far more efficient than filtering the results client side.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles and fall back on TIGER files only if CB is not
        available. Normally only set to `True` together with
        `with_geometry_columns`, since the extra columns differ between
        the CB and TIGER files.
    remove_water
        If `True` and `with_geometry=True`, clip water areas out of the
        returned geometry.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    variable_cache
        A cache of metadata about variables.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The downloaded variables, with or without added geometry, as
    either a `pd.DataFrame` or `gpd.GeoDataFrame`.
    """
    # Construct the request and fetch the raw table from the census API.
    url, params, bound_path = census_table_url(
        dataset,
        vintage,
        download_variables,
        query_filter=query_filter,
        api_key=api_key,
        **kwargs,
    )
    df = data_from_url(url, params)

    # Coerce column dtypes based on the metadata we have about each variable.
    _coerce_downloaded_variable_types(
        dataset, vintage, download_variables, df, variable_cache
    )

    # The census API returns variable columns in upper case. Reorder so
    # the geo fields (STATE, COUNTY, etc...) that came back are up front.
    upper_vars = [variable.upper() for variable in download_variables]
    geo_cols = [col for col in df.columns if col not in upper_vars]
    df = df[geo_cols + upper_vars]

    # Replace special sentinel values with NaN as requested.
    nan_values = ALL_SPECIAL_VALUES if set_to_nan is True else set_to_nan
    if nan_values:
        df = df.replace(list(nan_values), np.nan)

    if not with_geometry:
        return df

    # We need to get the geometry and merge it in. The geo level is the
    # innermost component of the bound path; the shapefile scope comes
    # from the outermost binding (e.g. the state).
    geo_level = bound_path.path_spec.path[-1]
    shapefile_scope = bound_path.bindings[bound_path.path_spec.path[0]]

    gdf = add_geography(
        df,
        vintage,
        shapefile_scope,
        geo_level,
        with_geometry_columns=with_geometry_columns,
        tiger_shapefiles_only=tiger_shapefiles_only,
    )

    if remove_water:
        gdf = clip_water(gdf, vintage)

    return gdf

972 

973 

974def _coerce_downloaded_variable_types( 

975 dataset: str, 

976 vintage: VintageType, 

977 download_variables: List[str], 

978 df_data: pd.DataFrame, 

979 variable_cache: "VariableCache", 

980) -> None: 

981 """ 

982 Coerce the type of each returned variable (column) in a data frame. 

983 

984 We look up the type in the metadata in `variable_cache`. 

985 

986 Parameters 

987 ---------- 

988 dataset 

989 The dataset to download from. For example `"acs/acs5"`, 

990 `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. 

991 vintage 

992 The vintage to download data for. For most data sets this is 

993 an integer year, for example, `2020`. But for 

994 a timeseries data set, pass the string `'timeseries'`. 

995 download_variables 

996 The census variables to download, for example `["NAME", "B01001_001E"]`. 

997 df_data 

998 The data that came back in JSON form from the census API. 

999 variable_cache 

1000 A cache of metadata about variables. 

1001 """ 

1002 for variable in download_variables: 

1003 # predicateType does not exist in some older data sets like acs/acs3 

1004 # So in that case we just go with what we got in the JSON. But if we 

1005 # have it try to set the type. 

1006 if "predicateType" in variable_cache.get(dataset, vintage, variable): 

1007 field_type = variable_cache.get(dataset, vintage, variable)["predicateType"] 

1008 

1009 if field_type == "int" or field_type == "long": 

1010 if df_data[variable].isnull().any(): 

1011 # Some Census data sets put in null in int fields. 

1012 # We have to go with a float to make this a NaN. 

1013 # Int has no representation for NaN or None. 

1014 df_data[variable] = df_data[variable].astype(float, errors="ignore") 

1015 else: 

1016 try: 

1017 df_data[variable] = df_data[variable].astype(int) 

1018 except ValueError: 

1019 # Sometimes census metadata says int, but they 

1020 # put in float values anyway, so fall back on 

1021 # trying to get them as floats. 

1022 df_data[variable] = df_data[variable].astype( 

1023 float, errors="ignore" 

1024 ) 

1025 except OverflowError: 

1026 # Some long IDs are actually better handled as strings. 

1027 df_data[variable] = df_data[variable].astype(str) 

1028 elif field_type == "float": 

1029 df_data[variable] = df_data[variable].astype(float) 

1030 elif field_type == "string": 

1031 pass 

1032 else: 

1033 # Leave it as an object? 

1034 pass 

1035 

1036 

def _prefetch_variable_types(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    variable_cache: "VariableCache",
) -> None:
    """
    Fetch metadata for every variable we are about to download.

    Looking each variable up in the cache (which hits the U.S. Census API
    on a miss) lets us fail fast, with a much better error message about
    the root cause, than if we blindly put unknown variable names into the
    data request and waited for the API to reject it.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    variable_cache
        A cache of metadata about variables.

    Raises
    ------
    CensusApiException
        If metadata for any of the variables cannot be retrieved.
    """
    for name in download_variables:
        try:
            variable_cache.get(dataset, vintage, name)
        except Exception as exc:
            # Build the census URLs the user can visit to debug the issue.
            census_url = CensusApiVariableSource.url(
                dataset, vintage, name, response_format="html"
            )
            census_variables_url = CensusApiVariableSource.variables_url(
                dataset, vintage, response_format="html"
            )

            raise CensusApiException(
                f"Unable to get metadata on the variable {name} from the "
                f"dataset {dataset} for year {vintage} from the census API. "
                f"Check the census URL for the variable ({census_url}) to ensure it exists. "
                f"If not found, check {census_variables_url} for all variables in the dataset."
            ) from exc

1082 

1083 

1084def _parse_download_variables( 

1085 dataset: str, 

1086 vintage: VintageType, 

1087 *, 

1088 download_variables: Optional[Union[str, Iterable[str]]] = None, 

1089 group: Optional[Union[str, Iterable[str]]] = None, 

1090 leaves_of_group: Optional[Union[str, Iterable[str]]] = None, 

1091 skip_annotations: bool = True, 

1092 variable_cache: Optional["VariableCache"] = None, 

1093) -> List[str]: 

1094 """ 

1095 Parse out the full set of download variables. 

1096 

1097 These may be encoded in `download_variables`, `group`, and/or `leaves_of_group`. 

1098 

1099 Parameters 

1100 ---------- 

1101 dataset 

1102 The dataset to download from. For example `"acs/acs5"`, 

1103 `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. 

1104 vintage 

1105 The vintage to download data for. For most data sets this is 

1106 an integer year, for example, `2020`. But for 

1107 a timeseries data set, pass the string `'timeseries'`. 

1108 download_variables 

1109 The census variables to download, for example `["NAME", "B01001_001E"]`. 

1110 group 

1111 One or more groups (as defined by the U.S. Census for the data set) 

1112 whose variable values should be downloaded. These are in addition to 

1113 any specified in `download_variables`. 

1114 leaves_of_group 

1115 One or more groups (as defined by the U.S. Census for the data set) 

1116 whose leaf variable values should be downloaded.These are in addition to 

1117 any specified in `download_variables` or `group`. See 

1118 :py:meth:`VariableCache.group_leaves` for more details on the semantics of 

1119 leaves vs. non-leaf group variables. 

1120 skip_annotations 

1121 If `True` try to filter out `group` or `leaves_of_group` variables that are 

1122 annotations rather than actual values. See :py:meth:`VariableCache.group_variables` 

1123 for more details. Variable names passed in `download_variables` are not 

1124 affected by this flag. 

1125 variable_cache 

1126 A cache of metadata about variables. 

1127 

1128 Returns 

1129 ------- 

1130 The fully expanded list of variables to download. 

1131 """ 

1132 # Turn the variables we were given into a list if they are not already. 

1133 if download_variables is None: 

1134 download_variables = [] 

1135 elif isinstance(download_variables, str): 

1136 download_variables = [download_variables] 

1137 elif not isinstance(download_variables, list): 

1138 download_variables = list(download_variables) 

1139 

1140 if group is None: 

1141 group = [] 

1142 elif isinstance(group, str): 

1143 group = [group] 

1144 

1145 if leaves_of_group is None: 

1146 leaves_of_group = [] 

1147 elif isinstance(leaves_of_group, str): 

1148 leaves_of_group = [leaves_of_group] 

1149 

1150 # Add group variables and leaves as appropriate. 

1151 group_variables: List[str] = [] 

1152 for group_name in group: 

1153 group_variables = group_variables + variable_cache.group_variables( 

1154 dataset, vintage, group_name, skip_annotations=skip_annotations 

1155 ) 

1156 group_leaf_variables: List[str] = [] 

1157 for group_name in leaves_of_group: 

1158 group_leaf_variables = group_leaf_variables + variable_cache.group_leaves( 

1159 dataset, vintage, group_name, skip_annotations=skip_annotations 

1160 ) 

1161 

1162 # Concatenate them all. 

1163 download_variables = download_variables + group_variables + group_leaf_variables 

1164 

1165 # Dedup and maintain order. 

1166 download_variables = list(dict.fromkeys(download_variables)) 

1167 

1168 return download_variables 

1169 

1170 

def census_table_url(
    dataset: str,
    vintage: VintageType,
    download_variables: Iterable[str],
    *,
    query_filter: Optional[Dict[str, str]] = None,
    api_key: Optional[str] = None,
    **kwargs: cgeo.InSpecType,
) -> Tuple[str, Mapping[str, str], cgeo.BoundGeographyPath]:
    """
    Construct the URL to download data from the U.S. Census API.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned. This filtering is done server side, so it
        is far more efficient than filtering the results client side.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The URL, parameters and bound path.
    """
    # Resolve the geography keywords into a bound geography path,
    # raising a helpful exception if they don't match anything.
    bound_path = _bind_path_if_possible(dataset, vintage, **kwargs)

    # Build a query spec and let it construct the URL and parameters.
    spec = cgeo.CensusGeographyQuerySpec(
        dataset, vintage, list(download_variables), bound_path, api_key=api_key
    )
    url, params = spec.table_url(query_filter=query_filter)

    return url, params, bound_path

1223 

1224 

def _bind_path_if_possible(dataset, vintage, **kwargs):
    """
    Bind the path if possible.

    If not, raise an exception with enough info to fix it.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`.
    vintage
        The vintage to download data for. For example, `2020`.
    kwargs
        A specification of the geography that we want data for.

    Returns
    -------
    The bound geography path. If no geography kwargs were given at all,
    a default empty path is returned.

    Raises
    ------
    CensusApiException
        If geography kwargs were given but could not be matched to any
        geography supported by the dataset and vintage.
    """
    bound_path = cgeo.PathSpec.partial_prefix_match(dataset, vintage, **kwargs)
    if bound_path is None:
        if kwargs:
            path_specs = cgeo.geo_path_snake_specs(dataset, vintage)
            possible_path_spec_keys = set(
                geo for path_name in path_specs.values() for geo in path_name
            )

            # Separate out kwargs that are not geography keywords at all,
            # so we can give a more specific error message.
            non_geo_kwargs = [
                k for k in kwargs.keys() if k not in possible_path_spec_keys
            ]

            if non_geo_kwargs:
                # Fixed typos in the user-facing message:
                # "goegraphic" -> "geographic", "mispelled" -> "misspelled",
                # "geopgrahic" -> "geographic".
                msg = f"""
The following arguments are not recognized as non-geographic arguments or geographic arguments
for the dataset {dataset} in vintage {vintage}: '{"', '".join(non_geo_kwargs)}'.

There are two reasons why this might happen:

1. The arg(s) mentioned above are misspelled versions of named or geographic arguments.
2. The arg(s) mentioned above are valid geographic arguments for some data sets and
   vintages, but not for {dataset} in vintage {vintage}.

"""
            else:
                msg = f"""
Unable to match the geography specification {kwargs}.

"""

            raise CensusApiException(
                f"{msg}"
                f"Supported geographies for dataset='{dataset}' in year={vintage} are:\n"
                + "\n".join(
                    f"{path_spec}"
                    for path_spec in cgeo.geo_path_snake_specs(
                        dataset, vintage
                    ).values()
                )
            )
        else:
            bound_path = cgeo.BoundGeographyPath("000", cgeo.PathSpec.empty_path_spec())

    return bound_path

1275 

1276 

def geography_names(
    dataset: str,
    vintage: VintageType,
    **kwargs: cgeo.InSpecType,
) -> pd.DataFrame:
    """
    Get the name of a specific geography.

    A thin convenience wrapper around :py:func:`~download` that requests
    only the `"NAME"` variable. It is designed to make it easy to get a
    human-readable name or label for display when we already know the
    FIPS code of a geography.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `censusdis.datasets.ACS5`.
    vintage
        The vintage to download data for. For example, `2020`.
    kwargs
        A specification of the geometry that we want data for. For example,
        `state = "34", county = "017"` will download the name of Hudson County,
        New Jersey.

    Returns
    -------
    A dataframe with columns specifying the geography and one for the name.
    All column names will be in ALL CAPS.
    """
    return download(dataset, vintage, ["NAME"], **kwargs)

1309 

1310 

def geographies(dataset: str, vintage: VintageType) -> List[List[str]]:
    """
    Determine what geographies are supported for a dataset and vintage.

    Each element of the returned outer list is one combination of
    geography keywords that can be used together in a call to
    :py:func:`download` for the given dataset and vintage.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.

    Returns
    -------
    A list of lists of geography keywords. Each element
    of the outer list is a list of keywords that can be
    used together.
    """
    snake_specs = cgeo.geo_path_snake_specs(dataset, vintage)

    return [path_spec for path_spec in snake_specs.values()]

1336 

1337 

# Module-level shared cache of variable metadata. It is used as the
# default when the caller of `download` does not pass a `variable_cache`.
variables = VariableCache()

1339 

1340 

def _intersecting_geos_kws(
    dataset: str,
    vintage: VintageType,
    containing_geo_kwargs: cgeo.InSpecType,
    **kwargs: cgeo.InSpecType,
) -> cgeo.InSpecType:
    """
    Construct geography keywords for intersecting geographies.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"
        in :py:module:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    containing_geo_kwargs
        Geographic keywords specifying the containing geography that we are
        looking for intersections with. For example
        `dict(metropolitan_statistical_area_micropolitan_statistical_area="35620")`
        for the New York area CBSA.
    kwargs
        A specification of the geometry that we want data for, limited to those
        geographies that are contained in the geography specified by `containing_geo_kwargs`.
        For example, `state="*", county="*", tract="*"` will specifies county-level data for
        all counties contained in the containing geography.

    Returns
    -------
    A dictionary of geographic keywords suitable for passing to :py:func:`~download`.
    """
    # This is a fast short circuit if there is only one
    # element of kwargs or the first component
    # is already specified. The former is since we will have to
    # query with the kwargs as they are, so we might as well
    # just let our caller do it. The second case is because
    # we might trim down the list, but more likely the user
    # double specified at the top level, like state=.
    if len(kwargs) == 1 or list(kwargs.values())[0] != "*":
        return kwargs

    # Download the geometry of the outer scope.
    gdf_within = download(
        dataset, vintage, ["NAME"], with_geometry=True, **containing_geo_kwargs
    )

    # See if we can find a matching path spec.
    bound_path = _bind_path_if_possible(dataset, vintage, **kwargs)

    # Get the geography for the outermost level of the match.
    # `first_binding` is a (geo level name, value) tuple, e.g. ("state", "*").
    first_binding = list(bound_path.bindings.items())[0]

    containing_geo_kwargs = {first_binding[0]: first_binding[1]}

    gdf_first_binding = download(
        dataset, vintage, ["NAME"], with_geometry=True, **containing_geo_kwargs
    )

    # Which of the first binding geographies intersect
    # the area we want our final geographies to be in.
    gdf_intersects = gdf_first_binding.sjoin(
        gdf_within, lsuffix="FIRST", rsuffix="within"
    )

    # The identifier column for the first binding level, e.g. "STATE".
    # NOTE(review): geopandas sjoin appears to apply the lsuffix only when
    # the column name collides with one on the right side; the fallback
    # below handles that suffixed case — confirm against geopandas docs.
    col_name = first_binding[0].replace(" ", "_").upper()
    if col_name not in gdf_intersects.columns:
        col_name = f"{col_name}_FIRST"

    intersecting_geographies = list(gdf_intersects[col_name].unique())

    # Short circuit if there are a massive number of intersection
    # geographies. In this case, we'll just leave things as they came with
    # the leading '*' and query them all. Otherwise the URL gets super
    # long and things go a little crazy. This can happen with zip code
    # tabulation areas.
    if len(intersecting_geographies) > 20:
        return dict(**kwargs)

    # Strip the "_FIRST" suffix (6 characters) from any suffixed values.
    intersecting_geographies = [
        geo[:-6] if geo.endswith("_FIRST") else geo for geo in intersecting_geographies
    ]

    geo = dict(bound_path.bindings)

    # Replace the outermost "*" with the explicit list of intersecting
    # geography identifiers.
    geo[first_binding[0]] = intersecting_geographies

    return geo

1432 

1433 

class ContainedWithin:
    """
    A representation of a geography that we want to query some other geographies that are contained within.

    Instances are usually created via :py:func:`contained_within` and may
    be used directly or as a context manager.
    """

    def __init__(self, area_threshold: float = 0.8, **kwargs: cgeo.InSpecType):
        """
        Construct a representation of a geography that we want to query some other geographies contained within.

        Parameters
        ----------
        area_threshold
            What fraction of the area of other geographies must be contained
            in our geography to be included.
        kwargs
            A specification of the geometry that we want data for geometries
            that are contained within. For example,
            `state = "NJ", place = "01960"` will specify the city of Asbury Park, NJ.
        """
        self._area_threshold = area_threshold
        self._containing_kwargs = kwargs

    def __eq__(self, other) -> bool:
        """Are two objects equal."""
        # Note: defining __eq__ without __hash__ makes instances unhashable
        # (Python sets __hash__ to None), which is fine for this use.
        if not isinstance(other, ContainedWithin):
            return False

        return (
            self._area_threshold == other._area_threshold
            and self._containing_kwargs == other._containing_kwargs
        )

    def __enter__(self) -> "ContainedWithin":
        """
        Enter the context.

        Returns
        -------
        The ContainedWithin object for use within the context.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the context."""
        # No resources to release; the context manager exists purely for
        # `with contained_within(...) as cw:` ergonomics.
        pass

    def download(
        self,
        dataset: str,
        vintage: VintageType,
        download_variables: Optional[Union[str, Iterable[str]]] = None,
        *,
        group: Optional[Union[str, Iterable[str]]] = None,
        leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
        set_to_nan: Union[bool, Iterable[int]] = True,
        skip_annotations: bool = True,
        query_filter: Optional[Dict[str, str]] = None,
        with_geometry: bool = False,
        with_geometry_columns: bool = False,
        tiger_shapefiles_only: bool = False,
        remove_water: bool = False,
        api_key: Optional[str] = None,
        variable_cache: Optional["VariableCache"] = None,
        row_keys: Optional[Union[str, Iterable[str]]] = None,
        **kwargs: cgeo.InSpecType,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download data for geographies contained within a containing geography.

        Parameters
        ----------
        dataset
            The dataset to download from. For example `"acs/acs5"`,
            `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
            symbolic names for datasets, like `ACS5` for `"acs/acs5"
            in :py:module:`censusdis.datasets`.
        vintage
            The vintage to download data for. For most data sets this is
            an integer year, for example, `2020`. But for
            a timeseries data set, pass the string `'timeseries'`.
        download_variables
            The census variables to download, for example `["NAME", "B01001_001E"]`.
        group
            One or more groups (as defined by the U.S. Census for the data set)
            whose variable values should be downloaded. These are in addition to
            any specified in `download_variables`.
        leaves_of_group
            One or more groups (as defined by the U.S. Census for the data set)
            whose leaf variable values should be downloaded.These are in addition to
            any specified in `download_variables` or `group`. See
            :py:meth:`VariableCache.group_leaves` for more details on the semantics of
            leaves vs. non-leaf group variables.
        set_to_nan
            A list of values that should be set to NaN. Normally these are special
            values that the U.S. Census API sometimes returns. If `True`, then all
            values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
            If `False`, no replacements will be made.
        skip_annotations
            If `True` try to filter out `group` or `leaves_of_group` variables that are
            annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
            for more details. Variable names passed in `download_variables` are not
            affected by this flag.
        query_filter
            A dictionary of values to filter on. For example, if
            `query_filter={'NAICS2017': '72251'}` then only rows
            where the variable `NAICS2017` has a value of `'72251'`
            will be returned.

            This filtering is done on the server side, not the client
            side, so it is far more efficient than querying without a
            query filter and then manually filtering the results.
        with_geometry
            If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
            will have a geometry that is a cartographic boundary suitable for platting
            a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            for details of the shapefiles that will be downloaded on your behalf to
            generate these boundaries.
        with_geometry_columns
            If `True` keep all the additional columns that come with shapefiles
            downloaded to get geometry information.
        tiger_shapefiles_only
            If `True` only look for TIGER shapefiles. If `False`, first look
            for CB shapefiles
            (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
            which are more suitable for plotting maps, then fall back on the full
            TIGER files
            (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
            only if CB is not available. This is mainly set to `True` only
            when `with_geometry_columns` is also set to `True`. The reason
            is that the additional columns in the shapefiles are different
            in the CB files than in the TIGER files.
        remove_water
            If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
            remove water areas from returned geometry.
        api_key
            An optional API key. If you don't have or don't use a key, the number
            of calls you can make will be limited to 500 per day.
        variable_cache
            A cache of metadata about variables.
        row_keys
            An optional set of identifier keys to help merge together requests for more than the census API limit of
            50 variables per query. These keys are useful for census datasets such as the Current Population Survey
            where the geographic identifiers do not uniquely identify each row.
        kwargs
            A specification of the geometry that we want data for. For example,
            `state = "*", county = "*"` will download county-level data for
            the entire US.

        Returns
        -------
        A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
        """
        # Narrow the geography kwargs down to those geographies that
        # intersect the containing geography, so we don't download the
        # whole country just to throw most of it away.
        geos_kwargs = _intersecting_geos_kws(
            dataset, vintage, self._containing_kwargs, **kwargs
        )

        # Always download with geometry — we need it below to test
        # containment. If the caller did not ask for geometry, it is
        # dropped at the end.
        gdf = download(
            dataset,
            vintage,
            download_variables,
            group=group,
            leaves_of_group=leaves_of_group,
            set_to_nan=set_to_nan,
            skip_annotations=skip_annotations,
            query_filter=query_filter,
            with_geometry=True,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            api_key=api_key,
            variable_cache=variable_cache,
            row_keys=row_keys,
            **geos_kwargs,
        )

        # See which of these geometries are mostly contained by
        # the geography we want to be within.

        gdf_container = download(
            dataset, vintage, ["NAME"], with_geometry=True, **self._containing_kwargs
        ).drop("NAME", axis="columns")

        # NOTE(review): the filtering below implies sjoin_mostly_contains
        # suffixes overlapping columns with "_large" (container side) and
        # "_small" (contained side) — confirm in censusdis.maps.
        gdf_contained = cmap.sjoin_mostly_contains(
            gdf_container, gdf, area_threshold=self._area_threshold
        )

        # Drop all the large container columns we don't need.
        gdf_contained = gdf_contained[
            [col for col in gdf_contained.columns if not col.endswith("_large")]
        ].reset_index(drop=True)

        # Drop the "_small" suffix.

        gdf_contained.rename(
            lambda col: col[:-6] if col.endswith("_small") else col,
            axis="columns",
            inplace=True,
        )

        if with_geometry:
            # Keep the columns from the larger result.
            return gdf_contained[
                [
                    col
                    for col in gdf_container.columns
                    if col in gdf_contained.columns and col != "geometry"
                ]
                + [
                    col
                    for col in gdf_contained.columns
                    if col not in gdf_container.columns or col == "geometry"
                ]
            ]
        else:
            # Drop the geometry and return a `pd.DataFrame`
            return pd.DataFrame(
                gdf_contained[
                    [
                        col
                        for col in gdf_container.columns
                        if col in gdf_contained.columns and col != "geometry"
                    ]
                    + [
                        col
                        for col in gdf_contained.columns
                        if col not in gdf_container.columns and col != "geometry"
                    ]
                ]
            )

1661 

1662 

def contained_within(
    area_threshold: float = 0.8, **kwargs: cgeo.InSpecType
) -> ContainedWithin:
    """
    Build a :py:class:`ContainedWithin` wrapper for a containing geography.

    The returned object can then be used to query other geographies that
    fall (mostly) inside the geography specified here.

    Parameters
    ----------
    area_threshold
        The minimum fraction of a candidate geography's area that must
        overlap the containing geography for it to count as contained.
    kwargs
        Geography specification of the containing area. For example,
        `state = "NJ", place = "01960"` will specify the city of Asbury Park, NJ.
    """
    # Simply delegate construction to the ContainedWithin class.
    container = ContainedWithin(area_threshold=area_threshold, **kwargs)
    return container

1680 

1681 

def add_inferred_geography(
    df_data: pd.DataFrame,
    year: Optional[int] = None,
    *,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
) -> gpd.GeoDataFrame:
    """
    Infer the geography level of the given dataframe.

    Add geometry to each row for the inferred level.

    See Also
    --------
    :py:func:`~infer_geo_level` for more on how inference is done.

    Parameters
    ----------
    df_data
        A dataframe of variables with one or more columns that
        can be used to infer what geometry level the rows represent.
    year
        The year for which to fetch geometries. We need this
        because they change over time. If `None`, look for a
        `'YEAR'` column in `df_data` and possibly add different
        geometries for different years as needed.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.

    Returns
    -------
    A geo data frame containing the original data augmented with
    the appropriate geometry for each row.

    Raises
    ------
    ValueError
        If `year` is `None` and `df_data` has no `'YEAR'` column.
    """
    if year is None:
        # We'll try to get the year out of the data.
        if "YEAR" not in df_data.columns:
            raise ValueError(
                "If year is None then there must be a `YEAR` column in the data."
            )

        # Recurse once per distinct year and stitch the per-year results
        # back together. `include_groups=False` keeps the YEAR grouping
        # column out of each recursive call (it is restored as a plain
        # column by the two `reset_index` calls below).
        # NOTE(review): `include_groups` requires a recent pandas
        # (2.2+) — confirm against the project's pandas pin.
        return gpd.GeoDataFrame(
            df_data.groupby("YEAR", group_keys=True, sort=False)
            .apply(
                lambda df_group: add_inferred_geography(
                    df_group,
                    df_group.name,  # the group key, i.e. the year
                    with_geometry_columns=with_geometry_columns,
                    tiger_shapefiles_only=tiger_shapefiles_only,
                ),
                include_groups=False,
            )
            # Drop the inner (per-group row) index level, then turn the
            # YEAR group key back into an ordinary column.
            .reset_index(level=1, drop=True)
            .reset_index(drop=False)
        )

    # A concrete year was given; infer the geography level from the
    # identifier columns present in the data.
    geo_level = infer_geo_level(year, df_data)

    # Ask the shapefile machinery what scope (e.g. nation vs. per-state)
    # the shapefiles for this level are published at. Only the first and
    # third elements of the returned tuple are needed here.
    (
        shapefile_scope,
        _,
        shapefile_scope_columns,
        _,
    ) = geo_query_from_data_query_inner_geo(year, geo_level)

    if shapefile_scope is not None:
        # The scope is the same across the board.
        gdf = add_geography(
            df_data,
            year,
            shapefile_scope,
            geo_level,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
        )
        return gdf

    # We have to group by different values of the shapefile
    # scope from the appropriate column and add the right
    # geography to each group.
    # NOTE(review): assumes `shapefile_scope_columns` is non-empty when
    # `shapefile_scope` is None — presumably guaranteed by
    # `geo_query_from_data_query_inner_geo`; verify there.
    shapefile_scope_column = shapefile_scope_columns[0]

    df_with_geo = (
        # Index by the scope column so each group's name (`g.name`) is
        # the scope value to fetch shapefiles for.
        df_data.set_index(shapefile_scope_column)
        .groupby(level=shapefile_scope_column, group_keys=True, sort=False)
        .apply(
            lambda g: add_geography(
                g,
                year,
                g.name,  # the scope value for this group
                geo_level,
                with_geometry_columns=with_geometry_columns,
                tiger_shapefiles_only=tiger_shapefiles_only,
            )
        )
        # Drop the inner row index, then restore the scope column.
        .reset_index(level=1, drop=True)
        .reset_index(drop=False)
    )

    # `apply` returns a plain DataFrame; re-wrap to get a GeoDataFrame.
    gdf = gpd.GeoDataFrame(df_with_geo)

    return gdf

1796 

1797 

# Module-level alias re-exporting the certificate configuration object
# from the fetch implementation layer — presumably so users can configure
# SSL/certificate handling as `censusdis.data.certificates` without
# importing the private `censusdis.impl.fetch` module directly.
# NOTE(review): confirm intended public use against censusdis.impl.fetch.
certificates = censusdis.impl.fetch.certificates