Coverage for censusdis/data.py: 94%

324 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

1# Copyright (c) 2022, 2023 Darren Erik Vengroff 

2""" 

3Utilities for loading census data. 

4 

5This module relies on the US Census API, which 

6it wraps in a pythonic manner. 

7""" 

8 

9import warnings 

10from logging import getLogger 

11from typing import ( 

12 Dict, 

13 Iterable, 

14 List, 

15 Mapping, 

16 Optional, 

17 Tuple, 

18 Union, 

19) 

20 

21import io 

22import requests 

23import gzip 

24 

25import geopandas as gpd 

26import numpy as np 

27import pandas as pd 

28 

29import censusdis.geography as cgeo 

30import censusdis.maps as cmap 

31from censusdis.impl.exceptions import CensusApiException 

32from censusdis.impl.fetch import data_from_url 

33from censusdis.impl.us_census_shapefiles import ( 

34 add_geography, 

35 clip_water, 

36 infer_geo_level, 

37 geo_query_from_data_query_inner_geo, 

38) 

39from censusdis.impl.varcache import VariableCache 

40from censusdis.impl.varsource.base import VintageType 

41from censusdis.impl.varsource.censusapi import CensusApiVariableSource 

42from censusdis.values import ALL_SPECIAL_VALUES 

43from censusdis.datasets import ACS5, DECENNIAL_PUBLIC_LAW_94_171 

44from censusdis.states import ABBREVIATIONS_FROM_IDS 

45 

46import censusdis.impl.fetch 

47 

48 

# Module-level logger named after this module, per standard `logging` practice.
logger = getLogger(__name__)

50 

51 

GeoFilterType = Optional[Union[str, Iterable[str]]]
"""
The type we accept for geographic filters.

They are used for the values of `kwargs` to
:py:func:`download`.

These filters are either single values as a string,
or, if multivalued, then an iterable containing all
the values allowed by the filter. For example::

    import censusdis.data as ced

    from censusdis.states import NJ, NY, CT

    # Two different kinds of kwarg for `state=`, both of
    # which are of `GeoFilterType`:
    df_one_state = ced.download("acs/acs5", 2020, ["NAME"], state=NJ)
    df_tri_state = ced.download("acs/acs5", 2020, ["NAME"], state=[NJ, NY, CT])
"""

72 

73 

def _gf2s(geo_filter: GeoFilterType) -> Optional[str]:
    """
    Render a geographic filter in the string form the Census API expects.

    The Census API encodes multiple filter values as a single
    comma-separated string; `None` and plain strings pass through
    unchanged.
    """
    if isinstance(geo_filter, str):
        return geo_filter
    if geo_filter is None:
        return None
    return ",".join(geo_filter)

84 

85 

_MAX_VARIABLES_PER_DOWNLOAD = 50
"""
The maximum number of variables we can ask for in one census API query.

The U.S. Census sets this limit, not us. In order to not expose our
users to the limit, :py:func:`~download` mostly obscures the fact that
requests to download more than this many variables are broken into
multiple calls to the census API and then the results are stitched back
together by either merging or concatenation. This is all handled in
:py:func:`~_download_multiple`.
"""

97 

98 

__dw_strategy_metrics = {"merge": 0, "concat": 0}
"""
Counters for how often we use each strategy for wide tables.

`_download_multiple` increments exactly one of these each time it
combines the results of multiple census API calls, and
`_download_wide_strategy_metrics` returns a snapshot of them.
"""

103 

104 

def _download_wide_strategy_metrics() -> Dict[str, int]:
    """
    Report how often each strategy has been used for wide tables.

    Returns
    -------
    A dictionary of metrics on how often each strategy has
    been used.
    """
    # Return a copy so callers cannot mutate the module-level counters.
    return dict(__dw_strategy_metrics)

115 

116 

def _download_multiple(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    *,
    query_filter: Dict[str, str],
    api_key: Optional[str],
    census_variables: "VariableCache",
    with_geometry: bool,
    with_geometry_columns: bool,
    tiger_shapefiles_only: bool,
    row_keys: Union[str, Iterable[str]],
    **kwargs: cgeo.InSpecType,
) -> pd.DataFrame:
    """
    Download data in groups of columns and concatenate the results together.

    The reason for this function is that the API will only return a maximum
    of 50 columns per query. This function downloads wider data 50 columns
    at a time and concatenates them.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    census_variables
        A cache of metadata about variables.
    row_keys
        An optional set of identifier keys to help merge together requests for more than the census API limit of
        50 variables per query. These keys are useful for census datasets such as the Current Population Survey
        where the geographic identifiers do not uniquely identify each row.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The full results of the query with all columns.

    """
    # Divide the variables into groups. If row keys are provided, include them in each chunk of variables,
    # while respecting the variable max.
    if row_keys:
        chunk_size = _MAX_VARIABLES_PER_DOWNLOAD - len(row_keys)
        variable_groups = [
            # Prepend the row keys to every chunk, then drop duplicates
            # since the row_key variables might already be present in one
            # of the chunks. `dict.fromkeys` preserves order. (The previous
            # filter expression here was a no-op: `list.index` always
            # returns the first occurrence, so its condition was always
            # true and duplicates were never removed.)
            list(
                dict.fromkeys(
                    row_keys
                    + download_variables[start : start + chunk_size]  # noqa: E203
                )
            )
            for start in range(0, len(download_variables), chunk_size)
        ]
    else:
        variable_groups = [
            # black and flake8 disagree about the whitespace before ':' here...
            download_variables[start : start + _MAX_VARIABLES_PER_DOWNLOAD]  # noqa: E203
            for start in range(0, len(download_variables), _MAX_VARIABLES_PER_DOWNLOAD)
        ]

    if len(variable_groups) < 2:
        raise ValueError(
            "_download_multiple expects to be called with at least "
            f"{_MAX_VARIABLES_PER_DOWNLOAD + 1} variables. With fewer, "
            "use download instead."
        )

    # Get the data for each chunk. Note that we leave out
    # extra geometry columns at this point. We will get them
    # later if we need them, but they get in the way at this
    # point.
    dfs = [
        download(
            dataset,
            vintage,
            variable_group,
            query_filter=query_filter,
            api_key=api_key,
            variable_cache=census_variables,
            # Only the first chunk carries geometry; it would be
            # redundant on the others.
            with_geometry=with_geometry and (ii == 0),
            with_geometry_columns=False,
            **kwargs,
        )
        for ii, variable_group in enumerate(variable_groups)
    ]

    # What variables came back in the first df but were not
    # requested? These are a key to the geography the row
    # represents. For example, 'STATE' and 'COUNTY' might
    # be these variables if we did a county-level query to
    # the census API.
    geo_key_variables = [f for f in dfs[0].columns if f not in set(variable_groups[0])]

    # Now that we know the geometry keys, we may have to get back the other
    # geometry columns we left out the first time.
    if with_geometry and with_geometry_columns:
        dfs[0] = download(
            dataset,
            vintage,
            variable_groups[0],
            query_filter=query_filter,
            api_key=api_key,
            variable_cache=census_variables,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            **kwargs,
        )

    # If we put in the geometry column, it's not part of the
    # key.
    if with_geometry:
        geo_key_variables = [f for f in geo_key_variables if f != "geometry"]

    # Now we have to decide if we are going to use the merge
    # strategy or the concat strategy to combine the data frames
    # we downloaded. Why do we have two strategies? Because we are
    # dealing with two kinds of data. One kind, from data sets like
    # ACS (https://www.census.gov/programs-surveys/acs.html),
    # has a unique key of columns that specify geography. The other
    # kind, from data sets like CPS
    # (https://www.census.gov/programs-surveys/cps.html) doesn't.
    #
    # In the unique key case, we can join the data frames that come
    # back on those key columns and get the final wide data frame
    # we want for the user.
    #
    # In the non-unique key case, we can't do this. There data sets
    # may have multiple rows for a value of the key columns. We can't
    # join here. Instead, we can only concatenate the tables
    # horizontally and hope that the rows came back in the same order
    # for each of them.

    # We hope to be able to merge. It is safer. If row_keys is supplied, they are included in
    # merge keys.
    merge_strategy = True
    if row_keys:
        merge_keys = geo_key_variables + row_keys
    else:
        merge_keys = geo_key_variables
    # But if there are any non-unique keys in any df, we can't
    # merge.
    for df_slice in dfs:
        if len(df_slice.value_counts(merge_keys, sort=False)) != len(df_slice.index):
            merge_strategy = False
            break

    if with_geometry and with_geometry_columns and not merge_strategy:
        raise ValueError(
            "`with_geometry_columns=True` is only supported for very wide results "
            "when the merge_strategy can be used. This merge strategy is used when every "
            "row of the result is for a unique geography, as is the case in data sets like "
            f'ACS5 ("{ACS5}") and DECENNIAL_PUBLIC_LAW_94_171 ("{DECENNIAL_PUBLIC_LAW_94_171}"). '
            "If this functionality is really important to you (note that it would create a lot "
            "of duplicate geometry values), we suggest you set `with_geometry=False` in this call "
            "and then merge with a `GeoDataFrame` with the geometries you want after the fact."
        )

    if merge_strategy:
        # We can do the merge strategy.

        __dw_strategy_metrics["merge"] = __dw_strategy_metrics["merge"] + 1

        df_data = dfs[0]

        for df_right in dfs[1:]:
            df_data_columns = set(df_data.columns)
            df_data = df_data.merge(
                df_right[
                    [
                        col
                        for col in df_right.columns
                        if (col in merge_keys) or (col not in df_data_columns)
                    ]
                ],
                on=merge_keys,
            )
    else:
        # We are going to have to fall back on the concat
        # strategy. Before we do the concat, however, let's
        # double-check that the key columns are the same in
        # at the corresponding row in every df. Otherwise, something
        # is fishy, and it is not safe to concat without mixing
        # data that should be in different rows.

        rows0 = len(dfs[0].index)

        for df_slice in dfs[1:]:
            if not (
                rows0 == len(df_slice.index)
                and dfs[0][geo_key_variables].equals(df_slice[geo_key_variables])
            ):
                # At least one difference. So we cannot use the
                # concat strategy either.
                if not row_keys:
                    raise CensusApiException(
                        "Neither the merge nor the concat strategy is viable. "
                        "We made multiple queries to the census API because more than "
                        f"{_MAX_VARIABLES_PER_DOWNLOAD} variables were requested. "
                        "If you don't need all the variables, it is always safer to "
                        f"download less than {_MAX_VARIABLES_PER_DOWNLOAD} variables. "
                        f"If you need more than {_MAX_VARIABLES_PER_DOWNLOAD}, you can supply the `row_keys` "
                        "argument with a set of variables that uniquely identify each row."
                    )
                else:
                    raise CensusApiException(
                        f"Neither the merge nor the concat strategy is viable using row_keys: {row_keys}. "
                        "The supplied keys should uniquely identify every row in the dataset to work. "
                        "If you don't need all the variables, it is always safer to "
                        f"download less than {_MAX_VARIABLES_PER_DOWNLOAD} variables. "
                    )

        # Concat strategy is as safe as it will ever be. We hope the server
        # side did not reorder the results across queries.
        logger.info(
            "Using the concat strategy, which is not guaranteed reliable if "
            "the census API returned data for multiple sub-queries of less than "
            "or equal to %d in different row orders. "
            "It is always safest to query no more than %d "
            "variables at a time. Please do so unless you really need them all.",
            _MAX_VARIABLES_PER_DOWNLOAD,
            _MAX_VARIABLES_PER_DOWNLOAD,
        )

        __dw_strategy_metrics["concat"] = __dw_strategy_metrics["concat"] + 1

        df_data = pd.concat(
            [dfs[0]] + [df.drop(geo_key_variables, axis="columns") for df in dfs[1:]],
            axis="columns",
        )

    return df_data

395 

396 

def download_lodes(
    dataset: str,
    vintage: VintageType,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    version: Optional[str] = None,
    home_geography: Optional[Union[bool, Dict[str, str]]] = None,
    with_geometry: bool = False,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
    remove_water: bool = False,
    **kwargs: cgeo.InSpecType,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download LODES data from the US Census API.

    This is typically not called directly, but instead LODES data
    is obtained by calling :py:func:`~download`, which then calls
    this as needed for LODES data sets.

    Parameters
    ----------
    dataset
        The LODES dataset to download from, of the form
        `"lodes/{type}/{part_or_segment}/{job_type}"`, for example
        `"lodes/od/main/jt00"`. The type component is one of
        `"od"`, `"rac"`, or `"wac"`.
    vintage
        The vintage to download data for. For LODES data this is
        an integer year, for example, `2020`.
    download_variables
        The census variables to download. If `None`, every non-string
        column in the LODES file is downloaded.
    version
        The LODES version to use when constructing the download URL.
        Defaults to `"LODES8"` if `None`.
    home_geography
        For origin-destination (`"od"`) data sets, how to group the home
        side of each record. If `True`, the same geography specification
        given in `kwargs` is used for the home side. If a dictionary, it
        is used as the home-side geography specification. If `None` or
        `False`, all home-side (`*_H`) columns are used as grouping keys.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    kwargs
        A specification of the geometry that we want data for. Must bind
        `state` to a single state; wildcards are not supported.

    Returns
    -------
    A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the
    requested LODES data.
    """
    if version is None:
        version = "LODES8"

    # A bare `home_geography=True` means "group the home side the same
    # way as the work side", i.e. by the geography given in kwargs.
    if isinstance(home_geography, bool):
        if home_geography:
            home_geography = dict(**kwargs)
        else:
            home_geography = None

    bound_path = cgeo.PathSpec.partial_prefix_match(dataset, vintage, **kwargs)
    geo_bindings = bound_path.bindings

    state = geo_bindings["state"]

    if state == "*":
        # TODO - we could just concatenate them all.
        raise ValueError("Wildcards not supported for state LODES data.")

    if state not in ABBREVIATIONS_FROM_IDS:
        raise ValueError(f"Unknown state id {state}")

    state_name = ABBREVIATIONS_FROM_IDS[state].lower()

    _, data_set_type, part_or_segment, job_type = dataset.split("/")

    # For rac and wac data sets, the middle component of the file name
    # (the segment) is upper case on the server.
    if data_set_type in ["rac", "wac"]:
        part_or_segment = part_or_segment.upper()

    url = (
        f"https://lehd.ces.census.gov/data/lodes/{version}/{state_name}/{data_set_type}/{state_name}_{data_set_type}_"
        f"{part_or_segment}_{job_type.upper()}_{vintage}.csv.gz"
    )

    logger.info(f"Downloading LODES data from {url}")

    results = requests.get(url)
    if results.status_code != requests.status_codes.codes.OK:
        raise CensusApiException(
            f"Unable to get LODES data. Attempted to fetch from {url}. "
            f"Status: {results.status_code}; {results.reason}"
        )

    # Reuse the response body we already fetched above instead of
    # downloading the file a second time (the original code issued a
    # redundant second `requests.get(url)` here).
    gz_content = results.content
    content = gzip.decompress(gz_content)
    df_lodes = pd.read_csv(
        io.StringIO(content.decode("utf-8")), dtype={"w_geocode": str, "h_geocode": str}
    )

    # We don't need the date.
    df_lodes = df_lodes.drop("createdate", axis="columns")

    # Map the geographies to the conventions censusdis uses.

    def map_geo_cols(*, from_prefix: str, to_suffix: str = ""):
        """Split a 15-digit LODES geocode column into STATE/COUNTY/TRACT/BLOCK columns."""
        df_lodes[f"STATE{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[:2]
        df_lodes[f"COUNTY{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[2:5]
        df_lodes[f"TRACT{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[5:11]
        df_lodes[f"BLOCK{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[11:15]

    if data_set_type in ["od", "wac"]:
        map_geo_cols(from_prefix="w_")
    else:
        map_geo_cols(from_prefix="h_")

    if data_set_type == "od":
        map_geo_cols(from_prefix="h_", to_suffix="_H")

    for geocode_col in ["w_geocode", "h_geocode"]:
        if geocode_col in df_lodes.columns:
            # `DataFrame.drop` returns a new frame; assign it back.
            # (The original code discarded the result, so the geocode
            # columns were never actually dropped.)
            df_lodes = df_lodes.drop(geocode_col, axis="columns")

    group_keys = []
    selectors = {}

    for geo, binding in geo_bindings.items():
        group_keys.append(f"{geo.upper()}")
        if binding != "*":
            selectors[f"{geo.upper()}"] = binding

    if data_set_type == "od":
        if home_geography is None:
            # No home-side grouping requested; keep every home-side
            # column as a grouping key.
            for col in df_lodes.columns:
                if col.endswith("_H"):
                    group_keys.append(col)
        else:
            # There is more grouping to do.
            home_bound_path = cgeo.PathSpec.partial_prefix_match(
                dataset, vintage, **home_geography
            )
            home_geo_bindings = home_bound_path.bindings

            for geo, binding in home_geo_bindings.items():
                group_keys.append(f"{geo.upper()}_H")
                if binding != "*":
                    selectors[f"{geo.upper()}_H"] = binding

    # Filter down based on fixed bindings.
    if selectors:
        criteria = None
        for col, binding in selectors.items():
            if criteria is None:
                criteria = df_lodes[col] == binding
            else:
                criteria = criteria & (df_lodes[col] == binding)
        df_lodes = df_lodes[criteria]

    if download_variables is None:
        download_variables = [
            col for col in df_lodes.columns if df_lodes[col].dtype != object
        ]

    # Group based on group keys.
    df_lodes = df_lodes.groupby(group_keys)[download_variables].sum().reset_index()

    if with_geometry:
        # We need to get the geometry and merge it in.
        geo_level = bound_path.path_spec.path[-1]
        shapefile_scope = bound_path.bindings[bound_path.path_spec.path[0]]

        gdf_data = add_geography(
            df_lodes,
            vintage,
            shapefile_scope,
            geo_level,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
        )

        if remove_water:
            gdf_data = clip_water(gdf_data, vintage)

        return gdf_data

    return df_lodes

587 

588 

def download(
    dataset: str,
    vintage: VintageType,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    *,
    group: Optional[Union[str, Iterable[str]]] = None,
    leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
    set_to_nan: Union[bool, Iterable[int]] = True,
    skip_annotations: bool = True,
    query_filter: Optional[Dict[str, str]] = None,
    with_geometry: bool = False,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
    remove_water: bool = False,
    download_contained_within: Optional[Dict[str, cgeo.InSpecType]] = None,
    area_threshold: float = 0.8,
    api_key: Optional[str] = None,
    variable_cache: Optional["VariableCache"] = None,
    row_keys: Optional[Union[str, Iterable[str]]] = None,
    **kwargs: cgeo.InSpecType,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download data from the US Census API.

    This is the main API for downloading US Census data with the
    `censusdis` package. There are many examples of how to use
    this in the demo notebooks provided with the package at
    https://github.com/vengroff/censusdis/tree/main/notebooks.

    *A note on variables and groups*: there are multiple ways to specify the
    variables you want to download, either individually in `download_variables`,
    by one or more groups in `group`, and by the leaves of one or more groups
    in `leaves_of_group`. Note that these three sources of variables are
    deduplicated, so you will only get one column for a variable no matter
    how many times it is specified.

    *Specifying census geographies*: censusdis provides access to many
    census datasets, each of which can be retrieved at a particular set of
    geographic grains. To accommodate this, `download()` takes a set
    of kwargs to define the geographic level of the returned data. You can check
    which geographies are available for a particular dataset with
    `geographies()`.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"`
        in :py:module:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    group
        One or more groups (as defined by the U.S. Census for the data set)
        whose variable values should be downloaded. These are in addition to
        any specified in `download_variables`.
    leaves_of_group
        One or more groups (as defined by the U.S. Census for the data set)
        whose leaf variable values should be downloaded. These are in addition to
        any specified in `download_variables` or `group`. See
        :py:meth:`VariableCache.group_leaves` for more details on the semantics of
        leaves vs. non-leaf group variables.
    set_to_nan
        A list of values that should be set to NaN. Normally these are special
        values that the U.S. Census API sometimes returns. If `True`, then all
        values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
        If `False`, no replacements will be made.
    skip_annotations
        If `True` try to filter out `group` or `leaves_of_group` variables that are
        annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
        for more details. Variable names passed in `download_variables` are not
        affected by this flag.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    download_contained_within
        A dictionary specifying the geography or geographies that our results
        should be filtered down to be contained within.
    area_threshold
        What fraction of the area of other geographies must be contained
        in our geography to be included. Ignored if `download_contained_within` is
        `None`.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited to 500 per day.
    variable_cache
        A cache of metadata about variables.
    row_keys
        An optional set of identifier keys to help merge together requests for more than the census API limit of
        50 variables per query. These keys are useful for census datasets such as the Current Population Survey
        where the geographic identifiers do not uniquely identify each row.
    kwargs
        A specification of the geometry that we want data for. For example,
        `state = "*", county = "*"` will download county-level data for
        the entire US.

    Returns
    -------
    A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
    """
    if dataset.startswith("lodes/"):
        # Special case for the LODES data sets, which go down a completely
        # different path.
        if download_contained_within is not None:
            raise ValueError(
                "`download_contained_within` not supported for LODES data sets."
            )

        return download_lodes(
            dataset,
            vintage,
            download_variables,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            **kwargs,
        )

    if download_contained_within is not None:
        # Put the contained_within context around it.
        return contained_within(
            area_threshold=area_threshold, **download_contained_within
        ).download(
            dataset,
            vintage,
            download_variables,
            group=group,
            leaves_of_group=leaves_of_group,
            set_to_nan=set_to_nan,
            skip_annotations=skip_annotations,
            query_filter=query_filter,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            api_key=api_key,
            variable_cache=variable_cache,
            row_keys=row_keys,
            **kwargs,
        )

    # Fall back on the module-level default variable cache.
    if variable_cache is None:
        variable_cache = variables

    # Ensure list operations work.
    if row_keys:
        row_keys = list(row_keys)

    # The side effect here is to prime the cache.
    cgeo.geo_path_snake_specs(dataset, vintage)

    if set_to_nan is True:
        set_to_nan = ALL_SPECIAL_VALUES

    # In case they came to us in py format, as kwargs often do.
    kwargs = {
        cgeo.path_component_from_snake(dataset, vintage, k): v
        for k, v in kwargs.items()
    }

    # Parse out the download variables.
    download_variables = _parse_download_variables(
        dataset,
        vintage,
        download_variables=download_variables,
        group=group,
        leaves_of_group=leaves_of_group,
        skip_annotations=skip_annotations,
        variable_cache=variable_cache,
    )

    # `row_keys` only matters when the request must be split into
    # multiple API calls; warn (and effectively ignore it) otherwise.
    if len(download_variables) <= _MAX_VARIABLES_PER_DOWNLOAD and row_keys:
        warnings.warn(
            "\n The row_keys argument is intended to be used only when the number of requested"
            "\n variables exceeds the Census defined limit of 50"
            "\n The supplied value(s) will be ignored",
            UserWarning,
        )
    # Special case if we are trying to get too many fields.
    if len(download_variables) > _MAX_VARIABLES_PER_DOWNLOAD:
        return _download_multiple(
            dataset,
            vintage,
            download_variables,
            api_key=api_key,
            census_variables=variable_cache,
            query_filter=query_filter,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            row_keys=row_keys,
            **kwargs,
        )

    # Prefetch all the types before we load the data.
    # That way we fail fast if a field is not known.
    _prefetch_variable_types(dataset, vintage, download_variables, variable_cache)
    # Also check that the row_keys, if supplied, are present in the dataset.
    if row_keys:
        _prefetch_variable_types(dataset, vintage, row_keys, variable_cache)

    # If we were given a list, join it together into
    # a comma-separated string.
    string_kwargs = {k: _gf2s(v) for k, v in kwargs.items()}

    return _download_remote(
        dataset,
        vintage,
        download_variables=download_variables,
        set_to_nan=set_to_nan,
        query_filter=query_filter,
        with_geometry=with_geometry,
        with_geometry_columns=with_geometry_columns,
        tiger_shapefiles_only=tiger_shapefiles_only,
        remove_water=remove_water,
        api_key=api_key,
        variable_cache=variable_cache,
        **string_kwargs,
    )

842 

843 

def _download_remote(
    dataset: str,
    vintage: VintageType,
    *,
    download_variables: List[str],
    set_to_nan: Union[bool, Iterable[float]] = True,
    query_filter: Optional[Dict[str, str]] = None,
    with_geometry: bool,
    with_geometry_columns: bool,
    tiger_shapefiles_only: bool,
    remove_water: bool,
    api_key: Optional[str],
    variable_cache: "VariableCache",
    **kwargs,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Make the actual remote call to download the data.

    This is the last stage of a download: the variables and geography
    have already been parsed and validated by the caller.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    set_to_nan
        A list of values that should be set to NaN. Normally these are special
        values that the U.S. Census API sometimes returns. If `True`, then all
        values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
        If `False`, no replacements will be made.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned. This filtering happens server side, which
        is far more efficient than filtering the results client side.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles and fall back on TIGER files only if CB is not
        available. Normally only set to `True` together with
        `with_geometry_columns`, since the extra columns differ between
        the CB and TIGER files.
    remove_water
        If `True` and `with_geometry=True`, clip water areas out of the
        returned geometry.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    variable_cache
        A cache of metadata about variables.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The downloaded variables, with or without added geometry, as
    either a `pd.DataFrame` or `gpd.GeoDataFrame`.
    """
    # Construct the request and fetch the raw table from the census API.
    url, params, bound_path = census_table_url(
        dataset,
        vintage,
        download_variables,
        query_filter=query_filter,
        api_key=api_key,
        **kwargs,
    )
    df = data_from_url(url, params)

    # Coerce column dtypes based on the metadata we have about each variable.
    _coerce_downloaded_variable_types(
        dataset, vintage, download_variables, df, variable_cache
    )

    # The census API returns variable columns in upper case. Reorder so
    # the geo fields (STATE, COUNTY, etc...) that came back are up front.
    upper_vars = [variable.upper() for variable in download_variables]
    geo_cols = [col for col in df.columns if col not in upper_vars]
    df = df[geo_cols + upper_vars]

    # Replace special sentinel values with NaN as requested.
    nan_values = ALL_SPECIAL_VALUES if set_to_nan is True else set_to_nan
    if nan_values:
        df = df.replace(list(nan_values), np.nan)

    if not with_geometry:
        return df

    # We need to get the geometry and merge it in. The geo level is the
    # innermost component of the bound path; the shapefile scope comes
    # from the outermost binding (e.g. the state).
    geo_level = bound_path.path_spec.path[-1]
    shapefile_scope = bound_path.bindings[bound_path.path_spec.path[0]]

    gdf = add_geography(
        df,
        vintage,
        shapefile_scope,
        geo_level,
        with_geometry_columns=with_geometry_columns,
        tiger_shapefiles_only=tiger_shapefiles_only,
    )

    if remove_water:
        gdf = clip_water(gdf, vintage)

    return gdf

972 

973 

974def _coerce_downloaded_variable_types( 

975 dataset: str, 

976 vintage: VintageType, 

977 download_variables: List[str], 

978 df_data: pd.DataFrame, 

979 variable_cache: "VariableCache", 

980) -> None: 

981 """ 

982 Coerce the type of each returned variable (column) in a data frame. 

983 

984 We look up the type in the metadata in `variable_cache`. 

985 

986 Parameters 

987 ---------- 

988 dataset 

989 The dataset to download from. For example `"acs/acs5"`, 

990 `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. 

991 vintage 

992 The vintage to download data for. For most data sets this is 

993 an integer year, for example, `2020`. But for 

994 a timeseries data set, pass the string `'timeseries'`. 

995 download_variables 

996 The census variables to download, for example `["NAME", "B01001_001E"]`. 

997 df_data 

998 The data that came back in JSON form from the census API. 

999 variable_cache 

1000 A cache of metadata about variables. 

1001 """ 

1002 for variable in download_variables: 

1003 # predicateType does not exist in some older data sets like acs/acs3 

1004 # So in that case we just go with what we got in the JSON. But if we 

1005 # have it try to set the type. 

1006 if "predicateType" in variable_cache.get(dataset, vintage, variable): 

1007 field_type = variable_cache.get(dataset, vintage, variable)["predicateType"] 

1008 

1009 if field_type == "int" or field_type == "long": 

1010 if df_data[variable].isnull().any(): 

1011 # Some Census data sets put in null in int fields. 

1012 # We have to go with a float to make this a NaN. 

1013 # Int has no representation for NaN or None. 

1014 df_data[variable] = df_data[variable].astype(float, errors="ignore") 

1015 else: 

1016 try: 

1017 df_data[variable] = df_data[variable].astype(int) 

1018 except ValueError: 

1019 # Sometimes census metadata says int, but they 

1020 # put in float values anyway, so fall back on 

1021 # trying to get them as floats. 

1022 df_data[variable] = df_data[variable].astype( 

1023 float, errors="ignore" 

1024 ) 

1025 except OverflowError: 

1026 # Some long IDs are actually better handled as strings. 

1027 df_data[variable] = df_data[variable].astype(str) 

1028 elif field_type == "float": 

1029 df_data[variable] = df_data[variable].astype(float) 

1030 elif field_type == "string": 

1031 pass 

1032 else: 

1033 # Leave it as an object? 

1034 pass 

1035 

1036 

def _prefetch_variable_types(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    variable_cache: "VariableCache",
) -> None:
    """
    Fetch metadata for every variable we are about to download.

    Looking each variable up in the cache (which hits the U.S. Census API
    on a miss) lets us fail fast, with a much better error message about
    the root cause, than if we blindly put unknown variable names into the
    data request and waited for the API to reject it.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    variable_cache
        A cache of metadata about variables.

    Raises
    ------
    CensusApiException
        If metadata for any of the variables cannot be retrieved.
    """
    for name in download_variables:
        try:
            variable_cache.get(dataset, vintage, name)
        except Exception as exc:
            # Build the census URLs the user can visit to debug the issue.
            census_url = CensusApiVariableSource.url(
                dataset, vintage, name, response_format="html"
            )
            census_variables_url = CensusApiVariableSource.variables_url(
                dataset, vintage, response_format="html"
            )

            raise CensusApiException(
                f"Unable to get metadata on the variable {name} from the "
                f"dataset {dataset} for year {vintage} from the census API. "
                f"Check the census URL for the variable ({census_url}) to ensure it exists. "
                f"If not found, check {census_variables_url} for all variables in the dataset."
            ) from exc

1082 

1083 

1084def _parse_download_variables( 

1085 dataset: str, 

1086 vintage: VintageType, 

1087 *, 

1088 download_variables: Optional[Union[str, Iterable[str]]] = None, 

1089 group: Optional[Union[str, Iterable[str]]] = None, 

1090 leaves_of_group: Optional[Union[str, Iterable[str]]] = None, 

1091 skip_annotations: bool = True, 

1092 variable_cache: Optional["VariableCache"] = None, 

1093) -> List[str]: 

1094 """ 

1095 Parse out the full set of download variables. 

1096 

1097 These may be encoded in `download_variables`, `group`, and/or `leaves_of_group`. 

1098 

1099 Parameters 

1100 ---------- 

1101 dataset 

1102 The dataset to download from. For example `"acs/acs5"`, 

1103 `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. 

1104 vintage 

1105 The vintage to download data for. For most data sets this is 

1106 an integer year, for example, `2020`. But for 

1107 a timeseries data set, pass the string `'timeseries'`. 

1108 download_variables 

1109 The census variables to download, for example `["NAME", "B01001_001E"]`. 

1110 group 

1111 One or more groups (as defined by the U.S. Census for the data set) 

1112 whose variable values should be downloaded. These are in addition to 

1113 any specified in `download_variables`. 

1114 leaves_of_group 

1115 One or more groups (as defined by the U.S. Census for the data set) 

1116 whose leaf variable values should be downloaded.These are in addition to 

1117 any specified in `download_variables` or `group`. See 

1118 :py:meth:`VariableCache.group_leaves` for more details on the semantics of 

1119 leaves vs. non-leaf group variables. 

1120 skip_annotations 

1121 If `True` try to filter out `group` or `leaves_of_group` variables that are 

1122 annotations rather than actual values. See :py:meth:`VariableCache.group_variables` 

1123 for more details. Variable names passed in `download_variables` are not 

1124 affected by this flag. 

1125 variable_cache 

1126 A cache of metadata about variables. 

1127 

1128 Returns 

1129 ------- 

1130 The fully expanded list of variables to download. 

1131 """ 

1132 # Turn the variables we were given into a list if they are not already. 

1133 if download_variables is None: 

1134 download_variables = [] 

1135 elif isinstance(download_variables, str): 

1136 download_variables = [download_variables] 

1137 elif not isinstance(download_variables, list): 

1138 download_variables = list(download_variables) 

1139 

1140 if group is None: 

1141 group = [] 

1142 elif isinstance(group, str): 

1143 group = [group] 

1144 

1145 if leaves_of_group is None: 

1146 leaves_of_group = [] 

1147 elif isinstance(leaves_of_group, str): 

1148 leaves_of_group = [leaves_of_group] 

1149 

1150 # Add group variables and leaves as appropriate. 

1151 group_variables: List[str] = [] 

1152 for group_name in group: 

1153 group_variables = group_variables + variable_cache.group_variables( 

1154 dataset, vintage, group_name, skip_annotations=skip_annotations 

1155 ) 

1156 group_leaf_variables: List[str] = [] 

1157 for group_name in leaves_of_group: 

1158 group_leaf_variables = group_leaf_variables + variable_cache.group_leaves( 

1159 dataset, vintage, group_name, skip_annotations=skip_annotations 

1160 ) 

1161 

1162 # Concatenate them all. 

1163 download_variables = download_variables + group_variables + group_leaf_variables 

1164 

1165 # Dedup and maintain order. 

1166 download_variables = list(dict.fromkeys(download_variables)) 

1167 

1168 return download_variables 

1169 

1170 

def census_table_url(
    dataset: str,
    vintage: VintageType,
    download_variables: Iterable[str],
    *,
    query_filter: Optional[Dict[str, str]] = None,
    api_key: Optional[str] = None,
    **kwargs: cgeo.InSpecType,
) -> Tuple[str, Mapping[str, str], cgeo.BoundGeographyPath]:
    """
    Construct the URL to download data from the U.S. Census API.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned. This filtering is done server side, so it
        is far more efficient than filtering the results client side.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The URL, parameters and bound path.
    """
    # Resolve the geography keywords into a bound geography path,
    # raising a helpful exception if they don't match anything.
    bound_path = _bind_path_if_possible(dataset, vintage, **kwargs)

    # Build a query spec and let it construct the URL and parameters.
    spec = cgeo.CensusGeographyQuerySpec(
        dataset, vintage, list(download_variables), bound_path, api_key=api_key
    )
    url, params = spec.table_url(query_filter=query_filter)

    return url, params, bound_path

1223 

1224 

def _bind_path_if_possible(dataset, vintage, **kwargs):
    """
    Bind the path if possible.

    If not, raise an exception with enough info to fix it.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`.
    vintage
        The vintage to download data for. For example, `2020`.
    kwargs
        A specification of the geography that we want data for.

    Returns
    -------
    The bound geography path. If no geography kwargs were given at all,
    a default empty path is returned.

    Raises
    ------
    CensusApiException
        If geography kwargs were given but could not be matched to any
        geography supported by the dataset and vintage.
    """
    bound_path = cgeo.PathSpec.partial_prefix_match(dataset, vintage, **kwargs)
    if bound_path is None:
        if kwargs:
            path_specs = cgeo.geo_path_snake_specs(dataset, vintage)
            possible_path_spec_keys = set(
                geo for path_name in path_specs.values() for geo in path_name
            )

            # Separate out kwargs that are not geography keywords at all,
            # so we can give a more specific error message.
            non_geo_kwargs = [
                k for k in kwargs.keys() if k not in possible_path_spec_keys
            ]

            if non_geo_kwargs:
                # Fixed typos in the user-facing message:
                # "goegraphic" -> "geographic", "mispelled" -> "misspelled",
                # "geopgrahic" -> "geographic".
                msg = f"""
The following arguments are not recognized as non-geographic arguments or geographic arguments
for the dataset {dataset} in vintage {vintage}: '{"', '".join(non_geo_kwargs)}'.

There are two reasons why this might happen:

1. The arg(s) mentioned above are misspelled versions of named or geographic arguments.
2. The arg(s) mentioned above are valid geographic arguments for some data sets and
   vintages, but not for {dataset} in vintage {vintage}.

"""
            else:
                msg = f"""
Unable to match the geography specification {kwargs}.

"""

            raise CensusApiException(
                f"{msg}"
                f"Supported geographies for dataset='{dataset}' in year={vintage} are:\n"
                + "\n".join(
                    f"{path_spec}"
                    for path_spec in cgeo.geo_path_snake_specs(
                        dataset, vintage
                    ).values()
                )
            )
        else:
            bound_path = cgeo.BoundGeographyPath("000", cgeo.PathSpec.empty_path_spec())

    return bound_path

1275 

1276 

def geography_names(
    dataset: str,
    vintage: VintageType,
    **kwargs: cgeo.InSpecType,
) -> pd.DataFrame:
    """
    Get the name of a specific geography.

    A thin convenience wrapper around :py:func:`~download` that requests
    only the `"NAME"` variable. It is designed to make it easy to get a
    human-readable name or label for display when we already know the
    FIPS code of a geography.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `censusdis.datasets.ACS5`.
    vintage
        The vintage to download data for. For example, `2020`.
    kwargs
        A specification of the geometry that we want data for. For example,
        `state = "34", county = "017"` will download the name of Hudson County,
        New Jersey.

    Returns
    -------
    A dataframe with columns specifying the geography and one for the name.
    All column names will be in ALL CAPS.
    """
    return download(dataset, vintage, ["NAME"], **kwargs)

1309 

1310 

def geographies(dataset: str, vintage: VintageType) -> List[List[str]]:
    """
    Determine what geographies are supported for a dataset and vintage.

    Each element of the returned outer list is one combination of
    geography keywords that can be used together in a call to
    :py:func:`download` for the given dataset and vintage.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.

    Returns
    -------
    A list of lists of geography keywords. Each element
    of the outer list is a list of keywords that can be
    used together.
    """
    snake_specs = cgeo.geo_path_snake_specs(dataset, vintage)

    return [path_spec for path_spec in snake_specs.values()]

1336 

1337 

# Module-level shared cache of variable metadata. It is used as the
# default when the caller of `download` does not pass a `variable_cache`.
variables = VariableCache()

1339 

1340 

def _intersecting_geos_kws(
    dataset: str,
    vintage: VintageType,
    containing_geo_kwargs: cgeo.InSpecType,
    **kwargs: cgeo.InSpecType,
) -> cgeo.InSpecType:
    """
    Construct geography keywords for intersecting geographies.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"
        in :py:module:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    containing_geo_kwargs
        Geographic keywords specifying the containing geography that we are
        looking for intersections with. For example
        `dict(metropolitan_statistical_area_micropolitan_statistical_area="35620")`
        for the New York area CBSA.
    kwargs
        A specification of the geometry that we want data for, limited to those
        geographies that are contained in the geography specified by `containing_geo_kwargs`.
        For example, `state="*", county="*", tract="*"` will specifies county-level data for
        all counties contained in the containing geography.

    Returns
    -------
    A dictionary of geographic keywords suitable for passing to :py:func:`~download`.
    """
    # This is a fast short circuit if there is only one
    # element of kwargs or the first component
    # is already specified. The former is since we will have to
    # query with the kwargs as they are, so we might as well
    # just let our caller do it. The second case is because
    # we might trim down the list, but more likely the user
    # double specified at the top level, like state=.
    if len(kwargs) == 1 or list(kwargs.values())[0] != "*":
        return kwargs

    # Download the geometry of the outer scope.
    gdf_within = download(
        dataset, vintage, ["NAME"], with_geometry=True, **containing_geo_kwargs
    )

    # See if we can find a matching path spec.
    bound_path = _bind_path_if_possible(dataset, vintage, **kwargs)

    # Get the geography for the outermost level of the match.
    # `first_binding` is a (geo level name, value) tuple, e.g. ("state", "*").
    first_binding = list(bound_path.bindings.items())[0]

    containing_geo_kwargs = {first_binding[0]: first_binding[1]}

    gdf_first_binding = download(
        dataset, vintage, ["NAME"], with_geometry=True, **containing_geo_kwargs
    )

    # Which of the first binding geographies intersect
    # the area we want our final geographies to be in.
    gdf_intersects = gdf_first_binding.sjoin(
        gdf_within, lsuffix="FIRST", rsuffix="within"
    )

    # The identifier column for the first binding level, e.g. "STATE".
    # NOTE(review): geopandas sjoin appears to apply the lsuffix only when
    # the column name collides with one on the right side; the fallback
    # below handles that suffixed case — confirm against geopandas docs.
    col_name = first_binding[0].replace(" ", "_").upper()
    if col_name not in gdf_intersects.columns:
        col_name = f"{col_name}_FIRST"

    intersecting_geographies = list(gdf_intersects[col_name].unique())

    # Short circuit if there are a massive number of intersection
    # geographies. In this case, we'll just leave things as they came with
    # the leading '*' and query them all. Otherwise the URL gets super
    # long and things go a little crazy. This can happen with zip code
    # tabulation areas.
    if len(intersecting_geographies) > 20:
        return dict(**kwargs)

    # Strip the "_FIRST" suffix (6 characters) from any suffixed values.
    intersecting_geographies = [
        geo[:-6] if geo.endswith("_FIRST") else geo for geo in intersecting_geographies
    ]

    geo = dict(bound_path.bindings)

    # Replace the outermost "*" with the explicit list of intersecting
    # geography identifiers.
    geo[first_binding[0]] = intersecting_geographies

    return geo

1432 

1433 

class ContainedWithin:
    """
    A representation of a geography that we want to query some other geographies that are contained within.

    Instances are usually created via :py:func:`contained_within` and may
    be used directly or as a context manager.
    """

    def __init__(self, area_threshold: float = 0.8, **kwargs: cgeo.InSpecType):
        """
        Construct a representation of a geography that we want to query some other geographies contained within.

        Parameters
        ----------
        area_threshold
            What fraction of the area of other geographies must be contained
            in our geography to be included.
        kwargs
            A specification of the geometry that we want data for geometries
            that are contained within. For example,
            `state = "NJ", place = "01960"` will specify the city of Asbury Park, NJ.
        """
        self._area_threshold = area_threshold
        self._containing_kwargs = kwargs

    def __eq__(self, other) -> bool:
        """Are two objects equal."""
        # Note: defining __eq__ without __hash__ makes instances unhashable
        # (Python sets __hash__ to None), which is fine for this use.
        if not isinstance(other, ContainedWithin):
            return False

        return (
            self._area_threshold == other._area_threshold
            and self._containing_kwargs == other._containing_kwargs
        )

    def __enter__(self) -> "ContainedWithin":
        """
        Enter the context.

        Returns
        -------
        The ContainedWithin object for use within the context.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the context."""
        # No resources to release; the context manager exists purely for
        # `with contained_within(...) as cw:` ergonomics.
        pass

    def download(
        self,
        dataset: str,
        vintage: VintageType,
        download_variables: Optional[Union[str, Iterable[str]]] = None,
        *,
        group: Optional[Union[str, Iterable[str]]] = None,
        leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
        set_to_nan: Union[bool, Iterable[int]] = True,
        skip_annotations: bool = True,
        query_filter: Optional[Dict[str, str]] = None,
        with_geometry: bool = False,
        with_geometry_columns: bool = False,
        tiger_shapefiles_only: bool = False,
        remove_water: bool = False,
        api_key: Optional[str] = None,
        variable_cache: Optional["VariableCache"] = None,
        row_keys: Optional[Union[str, Iterable[str]]] = None,
        **kwargs: cgeo.InSpecType,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download data for geographies contained within a containing geography.

        Parameters
        ----------
        dataset
            The dataset to download from. For example `"acs/acs5"`,
            `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
            symbolic names for datasets, like `ACS5` for `"acs/acs5"
            in :py:module:`censusdis.datasets`.
        vintage
            The vintage to download data for. For most data sets this is
            an integer year, for example, `2020`. But for
            a timeseries data set, pass the string `'timeseries'`.
        download_variables
            The census variables to download, for example `["NAME", "B01001_001E"]`.
        group
            One or more groups (as defined by the U.S. Census for the data set)
            whose variable values should be downloaded. These are in addition to
            any specified in `download_variables`.
        leaves_of_group
            One or more groups (as defined by the U.S. Census for the data set)
            whose leaf variable values should be downloaded.These are in addition to
            any specified in `download_variables` or `group`. See
            :py:meth:`VariableCache.group_leaves` for more details on the semantics of
            leaves vs. non-leaf group variables.
        set_to_nan
            A list of values that should be set to NaN. Normally these are special
            values that the U.S. Census API sometimes returns. If `True`, then all
            values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
            If `False`, no replacements will be made.
        skip_annotations
            If `True` try to filter out `group` or `leaves_of_group` variables that are
            annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
            for more details. Variable names passed in `download_variables` are not
            affected by this flag.
        query_filter
            A dictionary of values to filter on. For example, if
            `query_filter={'NAICS2017': '72251'}` then only rows
            where the variable `NAICS2017` has a value of `'72251'`
            will be returned.

            This filtering is done on the server side, not the client
            side, so it is far more efficient than querying without a
            query filter and then manually filtering the results.
        with_geometry
            If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
            will have a geometry that is a cartographic boundary suitable for platting
            a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            for details of the shapefiles that will be downloaded on your behalf to
            generate these boundaries.
        with_geometry_columns
            If `True` keep all the additional columns that come with shapefiles
            downloaded to get geometry information.
        tiger_shapefiles_only
            If `True` only look for TIGER shapefiles. If `False`, first look
            for CB shapefiles
            (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
            which are more suitable for plotting maps, then fall back on the full
            TIGER files
            (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
            only if CB is not available. This is mainly set to `True` only
            when `with_geometry_columns` is also set to `True`. The reason
            is that the additional columns in the shapefiles are different
            in the CB files than in the TIGER files.
        remove_water
            If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
            remove water areas from returned geometry.
        api_key
            An optional API key. If you don't have or don't use a key, the number
            of calls you can make will be limited to 500 per day.
        variable_cache
            A cache of metadata about variables.
        row_keys
            An optional set of identifier keys to help merge together requests for more than the census API limit of
            50 variables per query. These keys are useful for census datasets such as the Current Population Survey
            where the geographic identifiers do not uniquely identify each row.
        kwargs
            A specification of the geometry that we want data for. For example,
            `state = "*", county = "*"` will download county-level data for
            the entire US.

        Returns
        -------
        A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
        """
        # Narrow the geography kwargs down to those geographies that
        # intersect the containing geography, so we don't download the
        # whole country just to throw most of it away.
        geos_kwargs = _intersecting_geos_kws(
            dataset, vintage, self._containing_kwargs, **kwargs
        )

        # Always download with geometry — we need it below to test
        # containment. If the caller did not ask for geometry, it is
        # dropped at the end.
        gdf = download(
            dataset,
            vintage,
            download_variables,
            group=group,
            leaves_of_group=leaves_of_group,
            set_to_nan=set_to_nan,
            skip_annotations=skip_annotations,
            query_filter=query_filter,
            with_geometry=True,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            api_key=api_key,
            variable_cache=variable_cache,
            row_keys=row_keys,
            **geos_kwargs,
        )

        # See which of these geometries are mostly contained by
        # the geography we want to be within.

        gdf_container = download(
            dataset, vintage, ["NAME"], with_geometry=True, **self._containing_kwargs
        ).drop("NAME", axis="columns")

        # NOTE(review): the filtering below implies sjoin_mostly_contains
        # suffixes overlapping columns with "_large" (container side) and
        # "_small" (contained side) — confirm in censusdis.maps.
        gdf_contained = cmap.sjoin_mostly_contains(
            gdf_container, gdf, area_threshold=self._area_threshold
        )

        # Drop all the large container columns we don't need.
        gdf_contained = gdf_contained[
            [col for col in gdf_contained.columns if not col.endswith("_large")]
        ].reset_index(drop=True)

        # Drop the "_small" suffix.

        gdf_contained.rename(
            lambda col: col[:-6] if col.endswith("_small") else col,
            axis="columns",
            inplace=True,
        )

        if with_geometry:
            # Keep the columns from the larger result.
            return gdf_contained[
                [
                    col
                    for col in gdf_container.columns
                    if col in gdf_contained.columns and col != "geometry"
                ]
                + [
                    col
                    for col in gdf_contained.columns
                    if col not in gdf_container.columns or col == "geometry"
                ]
            ]
        else:
            # Drop the geometry and return a `pd.DataFrame`
            return pd.DataFrame(
                gdf_contained[
                    [
                        col
                        for col in gdf_container.columns
                        if col in gdf_contained.columns and col != "geometry"
                    ]
                    + [
                        col
                        for col in gdf_contained.columns
                        if col not in gdf_container.columns and col != "geometry"
                    ]
                ]
            )

1661 

1662 

def contained_within(
    area_threshold: float = 0.8, **kwargs: cgeo.InSpecType
) -> ContainedWithin:
    """
    Build a :py:class:`ContainedWithin` wrapper for a containing geography.

    The returned object can then be used to query other geographies that
    fall (mostly) inside the geography specified here.

    Parameters
    ----------
    area_threshold
        The minimum fraction of a candidate geography's area that must
        overlap the containing geography for it to count as contained.
    kwargs
        Geography specification of the containing area. For example,
        `state = "NJ", place = "01960"` will specify the city of Asbury Park, NJ.
    """
    # Simply delegate construction to the ContainedWithin class.
    container = ContainedWithin(area_threshold=area_threshold, **kwargs)
    return container

1680 

1681 

def add_inferred_geography(
    df_data: pd.DataFrame,
    year: Optional[int] = None,
    *,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
) -> gpd.GeoDataFrame:
    """
    Infer the geography level of the given dataframe.

    Add geometry to each row for the inferred level.

    See Also
    --------
    :py:func:`~infer_geo_level` for more on how inference is done.

    Parameters
    ----------
    df_data
        A dataframe of variables with one or more columns that
        can be used to infer what geometry level the rows represent.
    year
        The year for which to fetch geometries. We need this
        because they change over time. If `None`, look for a
        `'YEAR'` column in `df_data` and possibly add different
        geometries for different years as needed.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.

    Returns
    -------
    A geo data frame containing the original data augmented with
    the appropriate geometry for each row.

    Raises
    ------
    ValueError
        If `year` is `None` and `df_data` has no `'YEAR'` column.
    """
    if year is None:
        # We'll try to get the year out of the data.
        if "YEAR" not in df_data.columns:
            raise ValueError(
                "If year is None then there must be a `YEAR` column in the data."
            )

        # Recurse once per distinct year and stitch the per-year results
        # back together. `include_groups=False` keeps the YEAR grouping
        # column out of each recursive call (it is restored as a plain
        # column by the two `reset_index` calls below).
        # NOTE(review): `include_groups` requires a recent pandas
        # (2.2+) — confirm against the project's pandas pin.
        return gpd.GeoDataFrame(
            df_data.groupby("YEAR", group_keys=True, sort=False)
            .apply(
                lambda df_group: add_inferred_geography(
                    df_group,
                    df_group.name,  # the group key, i.e. the year
                    with_geometry_columns=with_geometry_columns,
                    tiger_shapefiles_only=tiger_shapefiles_only,
                ),
                include_groups=False,
            )
            # Drop the inner (per-group row) index level, then turn the
            # YEAR group key back into an ordinary column.
            .reset_index(level=1, drop=True)
            .reset_index(drop=False)
        )

    # A concrete year was given; infer the geography level from the
    # identifier columns present in the data.
    geo_level = infer_geo_level(year, df_data)

    # Ask the shapefile machinery what scope (e.g. nation vs. per-state)
    # the shapefiles for this level are published at. Only the first and
    # third elements of the returned tuple are needed here.
    (
        shapefile_scope,
        _,
        shapefile_scope_columns,
        _,
    ) = geo_query_from_data_query_inner_geo(year, geo_level)

    if shapefile_scope is not None:
        # The scope is the same across the board.
        gdf = add_geography(
            df_data,
            year,
            shapefile_scope,
            geo_level,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
        )
        return gdf

    # We have to group by different values of the shapefile
    # scope from the appropriate column and add the right
    # geography to each group.
    # NOTE(review): assumes `shapefile_scope_columns` is non-empty when
    # `shapefile_scope` is None — presumably guaranteed by
    # `geo_query_from_data_query_inner_geo`; verify there.
    shapefile_scope_column = shapefile_scope_columns[0]

    df_with_geo = (
        # Index by the scope column so each group's name (`g.name`) is
        # the scope value to fetch shapefiles for.
        df_data.set_index(shapefile_scope_column)
        .groupby(level=shapefile_scope_column, group_keys=True, sort=False)
        .apply(
            lambda g: add_geography(
                g,
                year,
                g.name,  # the scope value for this group
                geo_level,
                with_geometry_columns=with_geometry_columns,
                tiger_shapefiles_only=tiger_shapefiles_only,
            )
        )
        # Drop the inner row index, then restore the scope column.
        .reset_index(level=1, drop=True)
        .reset_index(drop=False)
    )

    # `apply` returns a plain DataFrame; re-wrap to get a GeoDataFrame.
    gdf = gpd.GeoDataFrame(df_with_geo)

    return gdf

1796 

1797 

# Module-level alias re-exporting the certificate configuration object
# from the fetch implementation layer — presumably so users can configure
# SSL/certificate handling as `censusdis.data.certificates` without
# importing the private `censusdis.impl.fetch` module directly.
# NOTE(review): confirm intended public use against censusdis.impl.fetch.
certificates = censusdis.impl.fetch.certificates