Coverage for censusdis/impl/us_census_shapefiles.py: 97%
119 statements
coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

# Copyright (c) 2023 Darren Erik Vengroff
"""
Details of how U.S. Census shapefiles and their contents are named.

Also includes utilities for managing downloaded shapefiles and
for water clipping.
"""

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Callable, Tuple, Optional, List, Generator, Union
from logging import getLogger

import geopandas as gpd
import pandas as pd

from censusdis import CensusApiException, maps as cmap
from censusdis.impl.geometry import drop_slivers_from_gdf
from censusdis.impl.varsource.base import VintageType

logger = getLogger(__name__)


_GEO_QUERY_FROM_DATA_QUERY_INNER_GEO: Dict[
    str,
    Callable[[int], Tuple[Optional[str], str, List[str], List[str]]],
] = {
    # innermost geo: ( shapefile_scope, shapefile_geo_name, df_on, gdf_on )
    "us": lambda year: ("us", "nation", ["US"], ["US"]),
    "region": lambda year: ("us", "region", ["REGION"], ["REGIONCE"]),
    "division": lambda year: ("us", "division", ["DIVISION"], ["DIVISIONCE"]),
    "combined statistical area": lambda year: (
        "us",
        "csa",
        ["COMBINED_STATISTICAL_AREA"],
        ["CSAFP"],
    ),
    "metropolitan statistical area/micropolitan statistical area": lambda year: (
        "us",
        "cbsa",
        ["METROPOLITAN_STATISTICAL_AREA_MICROPOLITAN_STATISTICAL_AREA"],
        ["CBSAFP"],
    ),
    "state": lambda year: ("us", "state", ["STATE"], ["STATEFP"]),
    "consolidated city": lambda year: (
        "us",
        "concity",
        ["STATE", "CONSOLIDATED_CITY"],
        ["STATEFP", "CONCTYFP"],
    ),
    "county": lambda year: (
        "us",
        "county",
        ["STATE", "COUNTY"],
        ["STATEFP", "COUNTYFP"],
    ),
    "public use microdata area": lambda year: (
        "us",
        "puma10" if year < 2020 else "puma20",
        ["STATE", "PUBLIC_USE_MICRODATA_AREA"],
        ["STATEFP", "PUMACE"] if year < 2020 else ["STATEFP20", "PUMACE20"],
    ),
    "congressional district": lambda year: (
        "us",
        _congressional_district_from_year(year),
        ["STATE", "CONGRESSIONAL_DISTRICT"],
        ["STATEFP", f"{_congressional_district_from_year(year).upper()}FP"],
    ),
    "zip code tabulation area": lambda year: (
        "us",
        "zcta520" if year >= 2020 else "zcta510",
        ["ZIP_CODE_TABULATION_AREA"],
        ["ZCTA5CE10" if year < 2020 else "ZCTA5CE20" if year == 2020 else "ZCTA5CE"],
    ),
    "american indian area/alaska native area/hawaiian home land": lambda year: (
        "us",
        "aiannh",
        ["AMERICAN_INDIAN_AREA_ALASKA_NATIVE_AREA_HAWAIIAN_HOME_LAND"],
        ["AIANNHCE"],
    ),
    "new england city and town area": lambda year: (
        "us",
        "necta",
        ["NEW_ENGLAND_CITY_AND_TOWN_AREA"],
        ["NECTAFP"],
    ),
    # For these, the shapefiles are at the state level, so `None`
    # indicates that we have to fill it in based on the geometry
    # being queried.
    "county subdivision": lambda year: (
        None,
        "cousub",
        ["STATE", "COUNTY_SUBDIVISION"],
        ["STATEFP", "COUSUB" if year == 2010 else "COUSUBFP"],
    ),
    "place": lambda year: (None, "place", ["STATE", "PLACE"], ["STATEFP", "PLACEFP"]),
    "tract": lambda year: (
        None,
        "tract",
        ["STATE", "COUNTY", "TRACT"],
        ["STATEFP", "COUNTYFP", "TRACTCE"],
    ),
    "block group": lambda year: (
        None,
        "bg",
        ["STATE", "COUNTY", "TRACT", "BLOCK_GROUP"],
        ["STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE"],
    ),
    "block": lambda year: (
        None,
        "tabblock",
        ["STATE", "COUNTY", "TRACT", "BLOCK"],
        ["STATEFP", "COUNTYFP", "TRACTCE", "BLOCKCE"],
    ),
    "school district (unified)": lambda year: (
        None,
        "unsd",
        ["STATE", "SCHOOL_DISTRICT_UNIFIED"],
        ["STATEFP", "UNSDLEA"],
    ),
    "school district (elementary)": lambda year: (
        # CB files from 2016 on exist with the "elsd" name and "ELSDLEA" column.
        # Earlier TIGER files use the name "sde" and the "ESDLEA" column.
        None,
        "elsd" if year >= 2016 else "sde",
        ["STATE", "SCHOOL_DISTRICT_ELEMENTARY"],
        ["STATEFP", "ELSDLEA" if year >= 2016 else "ESDLEA"],
    ),
    "school district (secondary)": lambda year: (
        # Similar scenario to school district (elementary).
        None,
        "scsd" if year >= 2016 else "sde",
        ["STATE", "SCHOOL_DISTRICT_SECONDARY"],
        ["STATEFP", "SCSDLEA" if year >= 2016 else "SSDLEA"],
    ),
    "state legislative district (upper chamber)": lambda year: (
        None,
        "sldu",
        ["STATE", "STATE_LEGISLATIVE_DISTRICT_UPPER_CHAMBER"],
        ["STATEFP", "SLDUST"],
    ),
    "state legislative district (lower chamber)": lambda year: (
        None,
        "sldl",
        ["STATE", "STATE_LEGISLATIVE_DISTRICT_LOWER_CHAMBER"],
        ["STATEFP", "SLDLST"],
    ),
    "voting district": lambda year: (
        None,
        "vtd",
        ["STATE", "COUNTY", "VOTING_DISTRICT"],
        (
            ["STATEFP20", "COUNTYFP20", "VTDST20"]
            if year >= 2020
            else ["STATEFP10", "COUNTYFP10", "VTDST10"]
        ),
    ),
    # This one could be a little dangerous if subminor civil
    # divisions exist in states and are not mapped as subbarrios.
    # It appears the only example of this is the ESTATE in the
    # USVI. So we are going to punt for the moment and deal with
    # it at some future time, probably by adding an arg to the
    # lambdas to take in the bound params so we can look at them.
    #
    # Noted in https://github.com/vengroff/censusdis/issues/223.
    "subminor civil division": lambda year: (
        None,
        "subbarrio",
        ["STATE", "COUNTY", "COUNTY_SUBDIVISION", "SUBMINOR_CIVIL_DIVISION"],
        ["STATEFP", "COUNTYFP", "COUSUBFP", "SUBMCDFP"],
    ),
    "alaska native regional corporation": lambda year: (
        None,
        "anrc",
        ["ALASKA_NATIVE_REGIONAL_CORPORATION"],
        ["ANRCFP"],
    ),
}
180"""
181Helper map for the _with_geometry case.
183A map from the innermost level of a geometry specification
184to the arguments we need to pass to `get_cb_shapefile`
185to get the right shapefile for the geography and the columns
186we need to join the data and shapefile on.
188The values are functions of the year. Once we locate one, we
189call it with the year as an argument to get the final value.
190This is necessary for e.g. congressional districts, which have
191names that change every two years. It also turns out to be very
192helpful in dealing with little quirks in the data like changing
193the name of the COUSUB/COSUBFP column in just one year.
195We should add everything in
196https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html
197to this map.
198"""
def geo_query_from_data_query_inner_geo(
    year: int, geo_level: str
) -> Tuple[Optional[str], str, List[str], List[str]]:
    """
    Look up `geo_level` in the map and call its value for the given year.

    Parameters
    ----------
    year
        The year to pass to callable values.
    geo_level
        The key to look up.

    Returns
    -------
    A tuple of (shapefile_scope, shapefile_geo_name, df_on, gdf_on).
    """
    if geo_level not in _GEO_QUERY_FROM_DATA_QUERY_INNER_GEO:
        raise CensusApiException(
            "The with_geometry=True flag is only allowed if the "
            f"geometry for the data to be loaded ('{geo_level}') is one of "
            f"{list(_GEO_QUERY_FROM_DATA_QUERY_INNER_GEO.keys())}."
        )

    return _GEO_QUERY_FROM_DATA_QUERY_INNER_GEO[geo_level](year)


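# A minimal usage sketch (return values follow directly from the map above):
#
#     scope, geo_name, df_on, gdf_on = geo_query_from_data_query_inner_geo(
#         2020, "county"
#     )
#     # scope == "us", geo_name == "county",
#     # df_on == ["STATE", "COUNTY"], gdf_on == ["STATEFP", "COUNTYFP"]
#
# Any geo_level not in the map raises a CensusApiException.

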
def _geo_query_from_data_query_inner_geo_items(
    year: int,
) -> Generator[
    Tuple[str, Tuple[Optional[str], str, List[str], List[str]]], None, None
]:
    """Generate the items in `_GEO_QUERY_FROM_DATA_QUERY_INNER_GEO` for a given year."""
    for geo_level in _GEO_QUERY_FROM_DATA_QUERY_INNER_GEO:
        yield geo_level, geo_query_from_data_query_inner_geo(year, geo_level)


def _congressional_district_from_year(year: int) -> str:
    """Construct the short form of the congressional district used in a given year."""
    # See the files in https://www2.census.gov/geo/tiger/GENZ2020/shp/
    # and similar. The interesting ones are of the form
    #
    #     cb_20YY_us_cdCCC_500k.zip
    #
    # where YY is the year and CCC is the congressional district
    # used.
    #
    # The mappings are not exactly at two-year intervals as we would expect.
    if year == 2020 or year == 2021:
        # For some reason they did not update to cd117 for these years. Pandemic?
        return "cd116"

    # Regular year rule.
    congress = 104 + (year - 1994) // 2
    return f"cd{congress}"


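# Working the regular-year rule through a couple of examples:
#
#     year = 2018: 104 + (2018 - 1994) // 2 = 104 + 12 -> "cd116"
#     year = 2022: 104 + (2022 - 1994) // 2 = 104 + 14 -> "cd118"
#
# The 104th Congress anchors at 1994 and the suffix advances every two
# years, except for the 2020-2021 special case handled above.

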
def infer_geo_level(year: Optional[int], df_data: pd.DataFrame) -> str:
    """
    Infer the geography level based on column names.

    Parameters
    ----------
    year
        The vintage of the data. `None` to infer from the data.
    df_data
        A dataframe of variables with one or more columns that
        can be used to infer what geometry level the rows represent.

        For example, if the column `"STATE"` exists, we could infer that
        the data is on a state-by-state basis. But if there are
        columns for both `"STATE"` and `"COUNTY"`, the data is probably
        at the county level.

        If, on the other hand, there is a `"COUNTY"` column but not a
        `"STATE"` column, then there is some ambiguity. The data
        probably corresponds to counties, but the same county ID can
        exist in multiple states, so we will raise a
        :py:class:`~CensusApiException` with an error message explaining
        the situation.

        If there is no match, we will also raise an exception. Again we
        do this, rather than, for example, returning `None`, so that we
        can provide an informative error message about the likely cause
        and what to do about it.

        This function is not often called directly, but rather from
        :py:func:`~add_inferred_geography`, which infers the geography
        level and then adds a `geometry` column containing the appropriate
        geography for each row.

    Returns
    -------
    The name of the geography level.
    """
    match_key = None
    match_on_len = 0
    partial_match_keys = []

    for k, (_, _, df_on, _) in _geo_query_from_data_query_inner_geo_items(year):
        if all(col in df_data.columns for col in df_on):
            # Full match. We want the longest full match we find,
            # so track the length of the best match so far.
            if match_key is None or len(df_on) > match_on_len:
                match_key = k
                match_on_len = len(df_on)
        elif df_on[-1] in df_data.columns:
            # Partial match. This could result in us
            # not getting what we expect. Like if we
            # have STATE and TRACT, but not COUNTY, we will
            # get a partial match on [STATE, COUNTY, TRACT]
            # and a full match on [STATE]. We probably did
            # not mean to match [STATE], we just lost the
            # COUNTY column somewhere along the way.
            partial_match_keys.append(k)

    if match_key is None:
        raise CensusApiException(
            "Unable to infer geometry. Was not able to locate any of the "
            "known sets of columns "
            f"{tuple(df_on for _, (_, _, df_on, _) in _geo_query_from_data_query_inner_geo_items(year))} "
            f"in the columns {list(df_data.columns)}."
        )

    if partial_match_keys:
        raise CensusApiException(
            f"Unable to infer geometry. Geometry matched {match_key} on columns "
            f"{geo_query_from_data_query_inner_geo(year, match_key)[2]} "
            "but also partially matched one or more candidates "
            f"{tuple(geo_query_from_data_query_inner_geo(year, k)[2] for k in partial_match_keys)}. "
            "Partial matches are usually unintended. Either add columns to allow a "
            "full match or rename some columns to prevent the undesired partial match."
        )

    return match_key


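# A sketch of the inference behavior (hypothetical dataframes, traced
# against the map above rather than executed):
#
#     df = pd.DataFrame({"STATE": ["34"], "COUNTY": ["013"], "X": [1.0]})
#     infer_geo_level(2020, df)   # -> "county"
#
#     df = pd.DataFrame({"STATE": ["34"], "TRACT": ["019000"], "X": [1.0]})
#     infer_geo_level(2020, df)   # raises CensusApiException: "state" fully
#                                 # matches, but ["STATE", "COUNTY", "TRACT"]
#                                 # partially matches.

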
def add_geography(
    df_data: pd.DataFrame,
    year: Optional[VintageType],
    shapefile_scope: str,
    geo_level: str,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Add geography to data.

    Parameters
    ----------
    df_data
        The data we downloaded from the census API.
    year
        The year for which to fetch geometries. We need this
        because they change over time. If `None`, look for a
        `'YEAR'` column in `df_data` and possibly add different
        geometries for different years as needed.
    shapefile_scope
        The scope of the shapefile. This is typically either a state
        such as `NJ` or the string `"us"`.
    geo_level
        The geography level we want to add.
    with_geometry_columns
        If `True`, keep all the additional columns that come with the
        shapefiles downloaded to get geometry information.
    tiger_shapefiles_only
        If `True`, only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is normally set to `True` only
        when `with_geometry_columns` is also `True`, because the additional
        columns in the CB files differ from those in the TIGER files.

    Returns
    -------
    A GeoDataFrame with the original data and an
    added geometry column for each row.
    """
    (
        query_shapefile_scope,
        shapefile_geo_level,
        df_on,
        gdf_on,
    ) = geo_query_from_data_query_inner_geo(year, geo_level)

    # If the query spec has a hard-coded value then we use it.
    if query_shapefile_scope is not None:
        shapefile_scope = query_shapefile_scope

    def individual_shapefile(sub_scope: str, query_year: int) -> gpd.GeoDataFrame:
        """Read the relevant shapefile and add a YEAR column to it."""
        resolution = "5m" if shapefile_geo_level == "nation" else "500k"

        try:
            if tiger_shapefiles_only:
                gdf = __shapefile_reader(query_year).read_shapefile(
                    sub_scope, shapefile_geo_level
                )
            else:
                gdf = __shapefile_reader(query_year).try_cb_tiger_shapefile(
                    sub_scope, shapefile_geo_level, resolution=resolution
                )
            gdf["YEAR"] = query_year
            return gdf
        except cmap.MapException as ex:
            # If there are some years where we can't find a shapefile,
            # skip over them, and those rows will not have geometry in
            # the final result.
            logger.info(
                "Unable to load shapefile for scope %s for year %d",
                sub_scope,
                query_year,
                exc_info=ex,
            )
            return gpd.GeoDataFrame()

    # If there is a single defined year then we can load the single
    # shapefile. If not, then we have to load multiple shapefiles,
    # one per year, and concatenate them.
    #
    # Whether there is a single year or multiple years, there could also
    # be multiple scopes, e.g. states, for which we have to download
    # shapefiles. If so, by the time we get here, they are encoded in
    # one string with comma separators.
    if isinstance(year, int):
        gdf_shapefile = pd.concat(
            individual_shapefile(sub_scope, year)
            for sub_scope in shapefile_scope.split(",")
        )
        merge_gdf_on = gdf_on
    else:
        gdf_shapefile = pd.concat(
            individual_shapefile(sub_scope, unique_year)
            for unique_year in df_data["YEAR"].unique()
            for sub_scope in shapefile_scope.split(",")
        )

        merge_gdf_on = ["YEAR"] + gdf_on
        df_on = ["YEAR"] + df_on

    if len(gdf_shapefile.index) == 0:
        # None of the years matched, so we add None for geometry to all.
        gdf = gpd.GeoDataFrame(df_data, copy=True)
        gdf.set_geometry([None for _ in gdf.index], inplace=True)
        return gdf

    if "TRACT" in df_data.columns:
        # Pre-2010 data has inconsistencies in TRACT string length.
        df_data["TRACT"] = df_data["TRACT"].str.ljust(6, "0")

    # The nation-level shapefile is missing a column
    # to merge on.
    if shapefile_geo_level == "nation":
        gdf_shapefile["US"] = "1"

    final_data_columns = df_data.columns.tolist()

    # Strip out extra columns unless told to keep them.
    if with_geometry_columns:
        final_data_columns = final_data_columns + [
            col
            for col in gdf_shapefile.columns
            if col not in merge_gdf_on
            and col not in final_data_columns
            and (col != "geometry")
        ]
    else:
        gdf_shapefile = gdf_shapefile[merge_gdf_on + ["geometry"]]

    gdf_data = gdf_shapefile.merge(
        # There is a `NAME` column in the shapefile, so we use suffixes
        # to avoid confusing it with the `NAME` that might be in df_data.
        # Similar for any other conflict in names that are not being merged on.
        df_data,
        how="right",
        left_on=merge_gdf_on,
        right_on=df_on,
        suffixes=("_shapefile", ""),
    )

    # Keep the columns we want in a reasonable order matching how
    # they are in the data, with geometry at the end.
    gdf_data = gdf_data[final_data_columns + ["geometry"]]

    return gdf_data


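# A minimal usage sketch (hypothetical data; assumes df_counties came back
# from the census API with STATE and COUNTY columns):
#
#     df_counties = pd.DataFrame(
#         {"STATE": ["34", "34"], "COUNTY": ["013", "017"], "POP": [800, 600]}
#     )
#     gdf_counties = add_geography(df_counties, 2020, "us", "county")
#     # gdf_counties has the original columns plus a geometry column
#     # holding one county polygon per row.

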
@dataclass
class _ShapefileRoot:
    """A private class to stash the root we will use to cache shapefiles locally."""

    shapefile_root: Optional[Path] = None


__shapefile_root = _ShapefileRoot()
__shapefile_readers: Dict[int, cmap.ShapeReader] = {}


def set_shapefile_path(shapefile_path: Union[Path, None]) -> None:
    """
    Set the path to the directory to cache shapefiles.

    This is where we will cache shapefiles downloaded when
    `with_geometry=True` is passed to :py:func:`~download`.

    Parameters
    ----------
    shapefile_path
        The path to use for caching shapefiles.
    """
    __shapefile_root.shapefile_root = shapefile_path


def get_shapefile_path() -> Union[Path, None]:
    """
    Get the path to the directory to cache shapefiles.

    This is where we will cache shapefiles downloaded when
    `with_geometry=True` is passed to :py:func:`~download`.

    Returns
    -------
    The path to use for caching shapefiles.
    """
    return __shapefile_root.shapefile_root


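# A small usage sketch (the cache directory shown is hypothetical):
#
#     from pathlib import Path
#
#     set_shapefile_path(Path.home() / "census_shapefile_cache")
#     get_shapefile_path()   # -> the Path set above

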
def __shapefile_reader(year: int):
    """Get the cached shapefile reader for a year, creating it if needed."""
    reader = __shapefile_readers.get(year, None)

    if reader is None:
        reader = cmap.ShapeReader(
            __shapefile_root.shapefile_root,
            year,
        )

        __shapefile_readers[year] = reader

    return reader


def _identify_counties(gdf_geo: gpd.GeoDataFrame, year: int):
    """
    Take a GeoDataFrame and identify which US counties the supplied geography overlaps.

    Parameters
    ----------
    gdf_geo
        A GeoDataFrame containing polygons within the United States.
    year
        The year for which to fetch geometries. We need this
        because they change over time.

    Returns
    -------
    A list of five-digit county FIPS codes.
    """
    # Some dataframes will contain the county column already.
    if "STATE" in gdf_geo and "COUNTY" in gdf_geo:
        fips_codes = gdf_geo["STATE"] + gdf_geo["COUNTY"]

        return fips_codes.unique().tolist()
    # Otherwise, we load all the US counties and perform an overlay operation.
    else:
        reader = __shapefile_reader(year)  # Creates and caches the reader if needed.
        us_counties = reader.read_cb_shapefile("us", "county")
        us_counties["FIPS"] = us_counties["STATEFP"] + us_counties["COUNTYFP"]

        county_overlap = us_counties.overlay(gdf_geo, keep_geom_type=False)
        fips_codes = county_overlap["STATEFP"] + county_overlap["COUNTYFP"]

        return fips_codes.unique().tolist()


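# Behavior sketch (hypothetical frame; when STATE and COUNTY columns are
# already present, no county shapefile needs to be read):
#
#     gdf = gpd.GeoDataFrame(
#         {"STATE": ["34", "34"], "COUNTY": ["013", "013"]},
#         geometry=[None, None],
#     )
#     _identify_counties(gdf, 2020)   # -> ["34013"]

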
def clip_water(
    gdf_geo: gpd.GeoDataFrame,
    year: int,
    minimum_area_sq_meters: int = 10000,
    sliver_threshold=0.01,
):
    """
    Remove water from the input `GeoDataFrame`.

    Parameters
    ----------
    gdf_geo
        The GeoDataFrame from which we want to remove water.
    year
        The year for which to fetch geometries. We need this
        because they change over time.
    minimum_area_sq_meters
        The minimum size of a water area to be removed.
    sliver_threshold
        The threshold passed to :py:func:`~drop_slivers_from_gdf` for
        dropping sliver geometries left behind by the clipping.

    Returns
    -------
    A GeoDataFrame with the water areas larger than
    the specified threshold removed.
    """
    counties = _identify_counties(gdf_geo, year)
    gdf_water = _retrieve_water(counties, year)

    gdf_without_water = _water_difference(gdf_geo, gdf_water, minimum_area_sq_meters)

    original_crs = gdf_without_water.crs
    gdf_without_water = drop_slivers_from_gdf(
        gdf_without_water.to_crs(epsg=3857), threshold=sliver_threshold
    ).to_crs(original_crs)

    return gdf_without_water


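# A typical call (sketch; assumes gdf_tracts is a GeoDataFrame of 2020
# census tracts, e.g. as returned by add_geography):
#
#     gdf_tracts_land_only = clip_water(gdf_tracts, 2020)
#
# Counties are identified first so that only the AREAWATER files for the
# relevant counties have to be downloaded.

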
def _retrieve_water(county_fips_codes: List[str], year: int):
    """
    Load `AREAWATER` files from TIGER for the specified counties.

    Parameters
    ----------
    county_fips_codes
        A list of five-digit county FIPS codes.
    year
        The year for which to fetch the water geometries.

    Returns
    -------
    A GeoDataFrame containing the census-defined water in the supplied counties.
    """
    reader = __shapefile_reader(year)  # Creates and caches the reader if needed.

    gdf_water = pd.concat(
        reader.read_shapefile(shapefile_scope=county, geography="areawater")
        for county in county_fips_codes
    )
    # GeoPandas has no concat function of its own, so we concatenate with
    # pandas and convert the result back to a GeoDataFrame.
    gdf_water = gpd.GeoDataFrame(gdf_water)
    return gdf_water


def _water_difference(
    gdf_geo: gpd.GeoDataFrame, gdf_water: gpd.GeoDataFrame, minimum_area_sq_meters: int
):
    """
    Remove water polygons exceeding a minimum size from the supplied `GeoDataFrame`.

    Parameters
    ----------
    gdf_geo
        A GeoDataFrame containing polygons within the United States.
    gdf_water
        A GeoDataFrame containing census AREAWATER polygons.
    minimum_area_sq_meters
        The smallest water polygon to be removed, specified in square meters.

    Returns
    -------
    A version of `gdf_geo` with the water areas removed.
    """
    # Combining the water polygons into a single geometry speeds up
    # the overlay operation.
    geo_combined_water = gdf_water[
        gdf_water["AWATER"] >= minimum_area_sq_meters
    ].union_all()
    gdf_combined_water = gpd.GeoDataFrame(geometry=[geo_combined_water])
    gdf_combined_water = gdf_combined_water.set_crs(gdf_water.crs)

    return gdf_geo.overlay(
        gdf_combined_water,
        "difference",
        keep_geom_type=False,
    )
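

# Example (sketch; hypothetical inputs): removing water bodies of at least
# one hectare (10,000 sq m; the AWATER attribute is in square meters) from
# a GeoDataFrame of tract polygons:
#
#     gdf_water = _retrieve_water(["34013"], 2020)
#     gdf_dry = _water_difference(gdf_tracts, gdf_water, 10_000)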