Coverage for censusdis/maps.py: 94%
331 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
1# Copyright (c) 2022 Darren Erik Vengroff
2"""
3Utilities for loading and rendering maps.
5This module relies on shapefiles from the US Census,
6which it downloads as needed and caches locally.
7"""
9import importlib.resources
10import shutil
11from logging import getLogger
12from pathlib import Path
13from typing import Dict, Optional, Tuple, Union, Any
14from zipfile import BadZipFile, ZipFile
16import contextily as cx
17import geopandas as gpd
18import matplotlib.pyplot as plt
19import pandas as pd
20import requests
21import shapely.affinity
22from haversine import haversine
23from shapely.geometry import MultiPolygon, Point, Polygon
24from shapely.geometry.base import BaseGeometry
25import matplotlib.patheffects as pe
27from censusdis.impl.exceptions import CensusApiException
28from censusdis.impl.fetch import certificates
29from censusdis.states import AK, HI, NAMES_FROM_IDS, PR
31logger = getLogger(__name__)
class MapException(CensusApiException):
    """An exception generated from `censusdis.maps` code."""
class ShapeReader:
    """
    A class for reading shapefiles into GeoPandas GeoDataFrames.

    See the demo notebooks for more details. The shapefiles need
    to already have been downloaded to the local machine. We may
    add a lazy option in the future that will fetch them if they
    don't exist.

    Parameters
    ----------
    shapefile_root
        The location in the filesystem where shapefiles are stored.
    year
        The year we want shapefiles for.
    auto_fetch
        If `True` then fetch remote shape files as needed.
    """

    def __init__(
        self,
        shapefile_root: Optional[Union[str, Path]] = None,
        year: int = 2020,
        auto_fetch: bool = True,
    ):
        if shapefile_root is None:
            # Default to a per-user cache directory, creating it if needed.
            shapefile_root = Path.home() / ".censusdis" / "data" / "shapefiles"
            shapefile_root.mkdir(exist_ok=True, parents=True)
        else:
            shapefile_root = Path(shapefile_root)

        self._shapefile_root = shapefile_root
        self._year = year
        self._auto_fetch = auto_fetch

    @property
    def shapefile_root(self) -> Path:
        """The path at which shapefiles are cached locally."""
        return self._shapefile_root

    def _read_shapefile(
        self, base_name: str, base_url: str, crs, timeout: int
    ) -> gpd.GeoDataFrame:
        """Read a shapefile, fetching and caching it first if necessary."""
        self._auto_fetch_file(base_name, base_url, timeout=timeout)

        path = self._shapefile_full_path(base_name)

        gdf = gpd.read_file(path)
        if crs is not None:
            gdf.to_crs(crs, inplace=True)
        return gdf

    def _shapefile_full_path(self, basename: str) -> Path:
        """Construct the full path to a shapefile in the local cache."""
        return self._shapefile_root / basename / f"{basename}.shp"

    def _2008_2009_tiger(self, prefix, shapefile_scope: str, suffix) -> Tuple[str, str]:
        """Construct the base URL and name for a 2008 or 2009 TIGER file."""
        # Sometimes we have to go down into a named
        # state directory.
        if suffix in ["cosub", "tract", "bg"]:
            state = shapefile_scope
            state_name = NAMES_FROM_IDS[state].upper().replace(" ", "_")

            base_url = f"https://www2.census.gov/geo/tiger/TIGER{self._year}/{state}_{state_name}"
            name = f"{prefix}_{self._year}_{state}_{suffix}00"
        else:
            # TODO - what to do here?
            base_url, name = self._through_2010_tiger(prefix, shapefile_scope, suffix)

        return base_url, name

    def _through_2010_tiger(self, prefix, shapefile_scope: str, suffix):
        """Construct the base URL and name for a TIGER file from 2010 or earlier."""
        # Curiously, the server side puts the 2000 files under
        # the TIGER2010 directory early in the path and early
        # in the file name.
        path_year = self._year
        path_year = max(path_year, 2010)

        base_url = f"https://www2.census.gov/geo/tiger/TIGER{path_year}/{suffix.upper()}/{self._year}"
        name = f"{prefix}_{path_year}_{shapefile_scope}_{suffix}{str(self._year)[-2:]}"
        return base_url, name

    def _post_2010_tiger(self, prefix, shapefile_scope: str, suffix):
        """Construct the base URL and name for a TIGER file after 2010."""
        # Special case for whatever reason the US Census decided.
        base_url = (
            f"https://www2.census.gov/geo/tiger/TIGER{self._year}/{suffix.upper()}"
        )

        if self._year == 2020 and suffix == "tabblock":
            base_url = f"{base_url}20"
            suffix = f"{suffix}20"

        if self._year in [2020, 2021] and suffix == "puma":
            suffix = f"{suffix}10"

        name = f"{prefix}_{self._year}_{shapefile_scope}_{suffix}"
        return base_url, name

    def tiger_url(self, prefix, shapefile_scope, suffix) -> Tuple[str, str]:
        """Construct a URL for a TIGER file."""
        if self._year == 2010 or self._year < 2008:
            return self._through_2010_tiger(prefix, shapefile_scope, suffix)
        elif 2008 <= self._year <= 2009:
            return self._2008_2009_tiger(prefix, shapefile_scope, suffix)
        else:
            return self._post_2010_tiger(prefix, shapefile_scope, suffix)

    def _tiger(self, shapefile_scope: str, geography, crs, timeout: int):
        """Read a TIGER shapefile and normalize its column names."""
        prefix, suffix = ("tl", geography)

        base_url, name = self.tiger_url(prefix, shapefile_scope, suffix)

        gdf = self._read_shapefile(name, base_url, crs, timeout=timeout)

        # Pull off the extra two digits of year that get tacked
        # on in some cases.
        def mapper(col: str) -> str:
            if col.endswith(("20", "10", "00")):
                return col[:-2]
            return col

        gdf.rename(mapper, axis="columns", inplace=True)

        if "STATEFP" not in gdf.columns:
            gdf["STATEFP"] = shapefile_scope

        return gdf

    # The summary level to use for each of the resolutions we
    # support.
    # See https://www2.census.gov/geo/tiger/GENZ2010/ReadMe.pdf
    _CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010 = {
        "state": "040",
        "county": "050",
        "cousub": "060",
        "tract": "140",
        "bg": "150",
    }

    def _through_2010_cb(
        self, cartographic_scope: str, geography: str, resolution: str
    ):
        """Construct the base URL and name for a cartographic boundary file from 2010 or earlier."""
        if geography not in self._CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010:
            # Use an f-string so the geography actually appears in the
            # message. Exceptions do not do lazy %-style interpolation
            # the way loggers do, so the original two-argument form
            # never substituted the geography into the text.
            raise MapException(
                f"Don't know how to interpret geography '{geography}' for pre-2010 maps."
            )

        summary_level = self._CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010[geography]

        name = f"gz_{self._year}_{cartographic_scope}_{summary_level}_00_{resolution}"
        base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}"

        return base_url, name

    def _post_2010_cb(self, cartographic_scope: str, geography, resolution: str):
        """Construct the base URL and name for a cartographic boundary file after 2010."""
        # May need to revise when 2020 PUMA is published.
        if geography == "puma" and 2010 <= self._year < 2022:
            geography = "puma10"
        elif geography == "puma" and self._year >= 2022:
            geography = "puma20"

        name = f"cb_{self._year}_{cartographic_scope}_{geography}_{resolution}"

        # The shp subdirectory did not appear until 2014.
        if self._year >= 2014:
            base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}/shp"
        else:
            base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}"

        return base_url, name

    def _cartographic_bound(
        self, shapefile_scope, geography, resolution, crs, *, timeout: int
    ) -> gpd.GeoDataFrame:
        """Read a cartographic boundary file and normalize its column names."""
        if self._year <= 2010:
            base_url, name = self._through_2010_cb(
                shapefile_scope, geography, resolution
            )
        else:
            base_url, name = self._post_2010_cb(shapefile_scope, geography, resolution)

        gdf = self._read_shapefile(name, base_url, crs, timeout=timeout)

        # Some files on the server, like
        # https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_050_00_500k.zip
        # leave the 'FP' suffix of column names.
        gdf.rename(
            {
                "STATE": "STATEFP",
                "COUNTY": "COUNTYFP",
                "TRACT": "TRACTCE",
                "BLKGRP": "BLKGRPCE",
            },
            axis="columns",
            inplace=True,
        )

        return gdf

    def read_shapefile(
        self, shapefile_scope: str, geography: str, crs=None, *, timeout: int = 30
    ):
        """
        Read the geometries of geographies.

        This method reads maps suitable
        for use with geometric joins and queries of various types. If you are
        only interested in plotting maps, the
        :py:meth:`~ShapeReader.read_cb_shapefile` method may be more suitable.

        The files are read from the US Census servers and cached locally.
        They are in most cases the same files you can download manually from
        https://www.census.gov/cgi-bin/geo/shapefiles/index.php.

        Individual files the API may download follow a naming convention
        that has evolved a bit over time. So for example a 2010 block group file
        for New Jersey would be found at
        https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_34_bg10.zip
        whereas a similar file for 2020 would be at
        https://www2.census.gov/geo/tiger/TIGER2020/BG/tl_2020_34_bg.zip.

        This method knows many of the subtle changes that have occurred over the years,
        so you should mostly not have to worry about them. It is unlikely it knows
        them all, so please submit an
        issue at https://github.com/vengroff/censusdis/issues if you find
        otherwise.

        Once read, the files are cached locally so that when we reuse the same
        files we do not have to go back to the server.

        Parameters
        ----------
        shapefile_scope
            The geography that is covered by the entire shapefile. In some
            cases, this is a state, e.g. `NJ`. For cases where files
            are available for the entire country, the string `"us"` is typically
            used. In some rare cases, like for the Alaska Native Regional Corporations
            (``"anrc"``) geography, other strings like ``"02"`` are used.
            See the download links at
            https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            if you need to debug issues with a given geography.
        geography
            The geography we want to download bounds for. Supported
            geometries are `"state'`, `"county"`, `"cousub"` (county subdivision),
            `"tract"`, and `"bg"` (block group). Other geometries as defined
            by the US Census may work, but have not been thoroughly tested.
        crs
            The crs to make the file to. If `None`, use the default
            crs of the shapefile. Setting this is useful if we plan
            to merge the resulting `GeoDataFrame` with another so we
            can make sure they use the same crs.
        timeout
            Time out limit (in seconds) for the remote call.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the requested
        geometries.
        """
        return self._tiger(shapefile_scope, geography, crs, timeout=timeout)

    def read_cb_shapefile(
        self,
        shapefile_scope: str,
        geography: str,
        resolution: str = "500k",
        crs=None,
        *,
        timeout: int = 30,
    ) -> gpd.GeoDataFrame:
        """
        Read the cartographic boundaries of a given geography.

        These are smaller
        files suited for plotting maps, as compared to those returned by
        :py:meth:`~ShapeReader.read_shapefile`, which returns higher
        resolution geometries.

        The files are read from the US Census servers and cached locally.
        They are in most cases the same files you can download manually from
        https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        or similar URLs for other years.

        Individual files the API may download follow a naming convention
        that has evolved a bit over time. So for example a 2010 census tract
        cartographic bounds file
        for New Jersey at 500,000:1 resolution would be found at
        https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_34_140_00_500k.zip
        whereas a similar file for 2020 would be at
        https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_34_tract_500k.zip

        This method knows many of the subtle changes that have occurred over the years,
        so you should mostly not have to worry about them. It is unlikely it knows
        them all, so please submit an
        issue at https://github.com/vengroff/censusdis/issues if you find
        otherwise.

        Once read, the files are cached locally so that when we reuse the same
        files we do not have to go back to the server.

        Parameters
        ----------
        shapefile_scope
            The geography that is covered by the entire shapefile. In some
            cases, this is a state, e.g. `NJ`. For cases where files
            are available for the entire country, the string `"us"` is typically
            used. In some rare cases, like for the Alaska Native Regional Corporations
            (``"anrc"``) geography, other strings like ``"02"`` are used.
            See the download links at
            https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            if you need to debug issues with a given geography.
        geography
            The geography we want to download bounds for. Supported
            geometries are `"state'`, `"county"`, `"cousub"` (county subdivision),
            `"tract"`, and `"bg"` (block group)
        resolution
            What resolution shapes should we use. Permitted options are
            `"500k"`, `"5m"`, and `"20m"` for 1:500,000, 1:5,000,000, and
            1:20,000,000 resolution respectively. Availability varies, but for
            most geographies `"500k"` is available even if others are not.
        crs
            The crs to make the file to. If `None`, use the default
            crs of the shapefile. Setting this is useful if we plan
            to merge the resulting `GeoDataFrame` with another so we
            can make sure they use the same crs.
        timeout
            Time out limit (in seconds) for the remote call.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the boundaries of the requested
        geometries.
        """
        return self._cartographic_bound(
            shapefile_scope, geography, resolution, crs, timeout=timeout
        )

    def try_cb_tiger_shapefile(
        self,
        shapefile_scope: str,
        geography: str,
        resolution: str = "500k",
        crs=None,
        *,
        timeout: int = 60,
    ) -> gpd.GeoDataFrame:
        """
        Try to retrieve CB file.

        Wraps read_cb_shapefile and read_shapefile.

        If unable to find, attempts to find the full tiger file.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the boundaries of the requested
        geometries.
        """
        try:
            gdf = self._cartographic_bound(
                shapefile_scope, geography, resolution, crs, timeout=timeout
            )
        except MapException as e:
            # Fall back to the full-resolution TIGER file. Use a lazy
            # %s placeholder; passing the exception as a bare extra
            # argument (with no placeholder) triggers a logging
            # formatting error instead of logging the exception.
            logger.debug("Exception loading cb file. Trying tiger instead. %s", e)
            gdf = self._tiger(shapefile_scope, geography, crs, timeout=timeout)

        return gdf

    def _auto_fetch_file(self, name: str, base_url: str, *, timeout: int):
        """Fetch a remote file, but only if auto-fetching is enabled."""
        if not self._auto_fetch:
            return

        self._fetch_file(name, base_url, timeout=timeout)

    def _url_for_file(self, name: str) -> str:
        """Guess the remote URL for a shapefile based on its name."""
        if name.startswith("cb_"):
            return f"https://www2.census.gov/geo/tiger/GENZ{self._year}/shp/{name}.zip"

        if name.startswith("tl_"):
            suffix = name.split("_")[-1]

            if self._year <= 2010:
                return (
                    f"https://www2.census.gov/geo/tiger/TIGER{self._year}/"
                    f"{suffix.upper()[:-2]}/{self._year}/{name}.zip"
                )

            return (
                f"https://www2.census.gov/geo/tiger/TIGER{self._year}/"
                f"{suffix.upper()}/{name}.zip"
            )

        # This will not work, but it's the main download page where we
        # can start to look for what we want.
        return "https://www.census.gov/cgi-bin/geo/shapefiles/index.php"

    def _fetch_file(
        self,
        name: str,
        base_url: str,
        *,
        timeout: int,
    ) -> None:
        """Download and unzip a shapefile into the local cache if not already there."""
        dir_path = self._shapefile_root / name

        if dir_path.is_dir():
            # Does it have the .shp file? If not maybe something
            # random went wrong in the previous attempt, or someone
            # deleted some stuff by mistake. So delete it and
            # reload.
            shp_path = dir_path / f"{name}.shp"
            if shp_path.is_file():
                # Looks like the shapefile is there.
                return

            # No shapefile so remove the whole directory and
            # hope for the best when we recreate it.
            shutil.rmtree(dir_path)

        # Make the directory
        dir_path.mkdir()

        # We will put the zip file in the dir we just created.
        zip_path = dir_path / f"{name}.zip"

        # Construct the URL to get the zip file.
        zip_url = f"{base_url}/{name}.zip"

        # Fetch the zip file and write it.
        response = requests.get(
            zip_url,
            timeout=timeout,
            cert=certificates.map_cert,
            verify=certificates.map_verify,
        )

        if response.status_code == 404:
            raise MapException(
                f"{zip_url} was not found. "
                "The Census Bureau may not publish the shapefile you are looking for for the given year. "
                "Or the file you are looking for may be from a year where a naming convention that censusdis "
                "does not recognize was used."
            )

        headers = response.headers
        content_type = headers.get("Content-Type", None)

        if content_type != "application/zip":
            # The original message had an unbalanced quote around the
            # expected content type; both quotes are now present.
            raise MapException(
                f"Expected content type 'application/zip' from {zip_url}, but got '{content_type}' instead."
            )

        with zip_path.open("wb") as file:
            file.write(response.content)

        # Unzip the file and extract all contents.
        try:
            with ZipFile(zip_path) as zip_file:
                zip_file.extractall(dir_path)
        except BadZipFile as exc:
            raise MapException(f"Bad zip file retrieved from {zip_url}") from exc
        finally:
            # We don't need the zipfile anymore.
            zip_path.unlink()
def clip_to_states(gdf, gdf_bounds):
    """
    Clip every geometry in a gdf to the state it belongs to, from the states in the state bounds.

    We clip to state bounds so that we don't plot areas
    outside the state. Typically, this clips areas that
    extend out into the water in coastal areas so we don't
    get strange artifacts in the water in plots.

    The way we tell what state an input geometry belongs to
    is by looking at the `STATEFP` column for that geometry's
    row in the input.

    Parameters
    ----------
    gdf
        The input geometries.
    gdf_bounds
        The state bounds.

    Returns
    -------
    The input geometries where each is clipped to the bounds
    of the state to which it belongs.
    """

    def _clip_state_group(state_group):
        # `state_group.name` is the STATEFP of this group; clip the
        # group's geometries to that one state's boundary.
        state_boundary = gdf_bounds[gdf_bounds.STATEFP == state_group.name]
        return gpd.clip(state_group, state_boundary)

    clipped = gdf.groupby(gdf.STATEFP).apply(_clip_state_group)
    return clipped.droplevel("STATEFP")
def _wrap_poly(poly: Union[Polygon, Point]):
    """
    Move a polygon (or point) westward across the antimeridian if needed.

    Used in shifting AK and HI geometries.
    """
    if isinstance(poly, Point):
        first_x = poly.x
    elif isinstance(poly, Polygon):
        exterior_xs, _ = poly.exterior.coords.xy
        first_x = exterior_xs[0]
    else:
        # Not sure how to parse it, so leave it
        # where it is.
        logger.warning(f"Unrecognized type {type(poly)} can't be wrapped.")
        return poly

    # A positive first longitude means the geometry sits just east of
    # the antimeridian; shift it 360° west so it plots next to AK.
    if first_x > 0:
        return shapely.affinity.translate(poly, xoff=-360.0, yoff=0.0)

    return poly
def _wrap_polys(polys):
    """
    Wrap each part of a multi-polygon across the antimeridian.

    Used in shifting AK and HI geometries.
    """
    if isinstance(polys, MultiPolygon):
        return MultiPolygon([_wrap_poly(part) for part in polys.geoms])

    # Not a MultiPolygon after all; wrap the single geometry directly.
    return _wrap_poly(polys)
# Boxes that contain AK and HI after _wrap_polys has
# been applied to it. We use this to identify
# geometries that we want to relocate in relocate_ak_hi
# when we don't have a STATEFP or STATE column to help
# identify what is in AK or HI.

# Bounding box of Alaska, after the Aleutian islands west of -180°
# have been wrapped by _wrap_polys.
_AK_MIN_X = -188.0
_AK_MIN_Y = 51.0
_AK_MAX_X = -129.0
_AK_MAX_Y = 72.0

_AK_BOUNDS = Polygon(
    (
        (_AK_MIN_X, _AK_MIN_Y),
        (_AK_MAX_X, _AK_MIN_Y),
        (_AK_MAX_X, _AK_MAX_Y),
        (_AK_MIN_X, _AK_MAX_Y),
        (_AK_MIN_X, _AK_MIN_Y),
    )
)

# Bounding box of Hawaii.
_HI_MIN_X = -179.0
_HI_MIN_Y = 18.0
_HI_MAX_X = -154.0
_HI_MAX_Y = 29.0

_HI_BOUNDS = Polygon(
    (
        (_HI_MIN_X, _HI_MIN_Y),
        (_HI_MAX_X, _HI_MIN_Y),
        (_HI_MAX_X, _HI_MAX_Y),
        (_HI_MIN_X, _HI_MAX_Y),
        (_HI_MIN_X, _HI_MIN_Y),
    )
)

# Bounding box of Puerto Rico.
_PR_MIN_X = -68.0
_PR_MIN_Y = 17.0
_PR_MAX_X = -65.0
_PR_MAX_Y = 19.0

_PR_BOUNDS = Polygon(
    (
        (_PR_MIN_X, _PR_MIN_Y),
        (_PR_MAX_X, _PR_MIN_Y),
        (_PR_MAX_X, _PR_MAX_Y),
        (_PR_MIN_X, _PR_MAX_Y),
        (_PR_MIN_X, _PR_MIN_Y),
    )
)
def _relocate_ak(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the AK bounding box.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    # Shrink AK around Anchorage, then shift it to sit below the
    # lower 48 states.
    anchorage = (-149.9003, 61.2181)
    shrunk = shapely.affinity.scale(geo, xfact=0.25, yfact=0.4, origin=anchorage)
    return shapely.affinity.translate(shrunk, xoff=33, yoff=-34)
def _relocate_hi(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the HI bounding box.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    # Shift HI east and north so it plots below the lower 48 states.
    return shapely.affinity.translate(geo, xoff=50, yoff=6)
def _relocate_pr(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the PR bounding box.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    # Nudge PR west and north so it plots closer to the mainland.
    return shapely.affinity.translate(geo, xoff=-7, yoff=8)
def _relocate_parts_in_ak_hi_pr(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate any sub-geometries that happen to fall in the AK or HI or PR bounding boxes.

    If the geometry is a simple polygon, check if it intersects the
    bounding boxes of AK or HI and relocate if so. If it is a
    `MultiPolygon` then recurse in and relocate some
    contained geometries as appropriate. This way it can work on small
    polygons completely contained in the bounding box, or on larger
    multi-polygons like regions that may have some polygons in the bounding
    box and others outside it.

    Parameters
    ----------
    geo
        The geography.

    Returns
    -------
    The geography, possibly with some parts relocated.
    """
    if isinstance(geo, MultiPolygon):
        # Recurse so each part is tested and relocated independently.
        return MultiPolygon([_relocate_parts_in_ak_hi_pr(part) for part in geo.geoms])

    # A single polygon: relocate it if it falls inside one of the
    # bounding boxes of the areas we move.
    if geo.intersects(_AK_BOUNDS):
        return _relocate_ak(geo)
    if geo.intersects(_HI_BOUNDS):
        return _relocate_hi(geo)
    if geo.intersects(_PR_BOUNDS):
        return _relocate_pr(geo)

    return geo
def _wrap_and_relocate_geos(geo: BaseGeometry):
    """Wrap across the antimeridian, then relocate any AK/HI/PR parts."""
    wrapped = _wrap_polys(geo)
    return _relocate_parts_in_ak_hi_pr(wrapped)
def _relocate_ak_hi_pr_group(group):
    """
    Relocate a group of geometries.

    They are relocated if they belong to AK, HI or PR, otherwise
    they are left alone.
    """
    state = group.name

    if state == AK:
        # Deal with the Aleutian islands wrapping at -180/180 longitude
        # before relocating.
        group.geometry = group.geometry.apply(_wrap_polys).apply(_relocate_ak)
    elif state == HI:
        group.geometry = group.geometry.apply(_relocate_hi)
    elif state == PR:
        group.geometry = group.geometry.apply(_relocate_pr)

    return group
def relocate_ak_hi_pr(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Relocate any geometry that is in Alaska, Hawaii or Puerto Rico for plotting purposes.

    We first try an optimization. If there is a `STATEFP`
    column then we relocate rows where that column has a value of
    `AK`, `HI` or `PR`. If there is not a
    `STATEFP` column we check for a `STATE` column and do the same. If neither
    column exists then we dig down into the geometries themselves
    and relocate those that intersect bounding rectangles of the
    relocated states.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        the geo data frame to relocate.

    Returns
    -------
    a geo data frame with any geometry in AK or HI moved for plotting.
    """
    if "STATEFP" in gdf.columns or "STATE" in gdf.columns:
        # There is a column identifying the state of each geometry
        # so use that to decide what to relocate.
        if "STATEFP" in gdf.columns:
            group_column = "STATEFP"
        else:
            group_column = "STATE"

        # Group by state, relocate each group, then flatten the
        # resulting (state, row) MultiIndex back into a plain index
        # while restoring the state column dropped by
        # `include_groups=False`.
        gdf = (
            gdf.groupby(gdf[group_column], group_keys=True, sort=False)
            .apply(
                _relocate_ak_hi_pr_group,
                include_groups=False,
            )
            .reset_index(level=1, drop=True)
            .reset_index(drop=False)
        )
    else:
        # There is no column indicating the state of each geometry. This
        # is often because the geometries span states. So we can't easily
        # relocate the two states, but we at least wrap the Aleutian
        # islands if present and then relocate any geometries that are
        # in the bounding boxes of AK and HI.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_and_relocate_geos)

    return gdf
# Lazily-loaded singleton; populated on first call to _gdf_crs_bounds().
__gdf_crs_bounds: Optional[gpd.GeoDataFrame] = None
"""The bounds of all the CRSs we might use in `plot_map`."""
def _gdf_crs_bounds() -> gpd.GeoDataFrame:
    """
    Construct a dataframe with the bounds of all the CRSs we might use in `plot_map`.

    Returns
    -------
    The dataframe. It is a singleton you should not modify.
    """
    global __gdf_crs_bounds

    # Already loaded on a previous call; reuse the singleton.
    if __gdf_crs_bounds is not None:
        return __gdf_crs_bounds

    # First call: load the geojson resource bundled with the package.
    resource_package = f"{__package__}.resources"
    with importlib.resources.path(resource_package, "crs_bounds.geojson") as path:
        __gdf_crs_bounds = gpd.GeoDataFrame.from_file(path)

    return __gdf_crs_bounds
def _closest_epsg(
    gdf: gpd.GeoDataFrame,
) -> int:
    """
    Find the EPSG to use by choosing the one closest to the center of the bounds of the gdf.

    We do this by looking at haversine distance between the
    centers. I am sure we could do this more efficiently
    than a linear scan using a data structure optimized for
    nearest neighbor queries on the surface of a sphere.
    But we only have about 100 candidates, so the linear scan is
    simple and not terrible.

    Parameters
    ----------
    gdf
        The gdf we want to plot.

    Returns
    -------
    The EPSG number.
    """
    candidates = _gdf_crs_bounds()

    # Compare in the same CRS as the candidate bounds.
    if gdf.crs != candidates.crs:
        gdf = gdf.copy(deep=True).to_crs(candidates.crs)

    min_x, min_y, max_x, max_y = gdf.total_bounds

    # Wrap the end of the Aleutian island chain.
    if min_x > 0:
        min_x = min_x - 360.0
    if max_x > 0:
        max_x = max_x - 360.0

    # Center of the (possibly wrapped) bounding box.
    lon = (min_x + max_x) / 2
    lat = (min_y + max_y) / 2

    centers = candidates.representative_point()

    # Linear scan for the candidate CRS whose center is nearest.
    distances = centers.map(lambda center: haversine((center.y, center.x), (lat, lon)))

    return candidates["epsg"].iloc[distances.argmin()]
def plot_map(
    gdf: gpd.GeoDataFrame,
    *args,
    with_background: bool = False,
    epsg: Optional[int] = None,
    geo_label: Optional[Union[str, pd.Series]] = None,
    geo_label_text_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> plt.Axes:
    """
    Plot a map, optionally with a background.

    Parameters
    ----------
    gdf
        The geo data frame to plot
    args
        Optional args to matplotlib
    with_background
        Should we put in a background map from Open Street maps?
    epsg
        The EPSG to project to. Otherwise a suitable one for the
        geometry will be inferred.
    geo_label
        Name of a column in `gdf` that has labels for each geometry.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`
    kwargs
        keyword args to pass on to matplotlib

    Returns
    -------
    The ax of the resulting plot.
    """
    # Infer a suitable projection if the caller did not choose one.
    if epsg is None:
        epsg = _closest_epsg(gdf)

    projected = gdf.to_crs(epsg=epsg)

    ax = projected.plot(*args, **kwargs)

    if geo_label is not None:
        _add_plot_map_geo_labels(ax, projected, geo_label, geo_label_text_kwargs)

    # Lat/lon tick labels are not meaningful on a projected map.
    ax.tick_params(
        left=False,
        right=False,
        bottom=False,
        labelleft=False,
        labelbottom=False,
    )

    if with_background:
        cx.add_basemap(
            ax,
            crs=projected.crs.to_string(),
            source=cx.providers.OpenStreetMap.Mapnik,
        )

    return ax
def _add_plot_map_geo_labels(
    ax: plt.Axes,
    gdf: gpd.GeoDataFrame,
    geo_label: Union[str, pd.Series],
    geo_label_text_kwargs: Optional[Dict[str, Any]],
) -> plt.Axes:
    """
    Add labels to each geometry in a plot.

    Internal utility function behind `plot_map` and `plot_us`.

    Parameters
    ----------
    ax
        The ax we already plotted on.
    gdf
        The geographic data frame we are plotting.
    geo_label
        The column in `gdf` that contains the labels to add to the plot.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`

    Returns
    -------
    `ax`
    """
    user_text_kwargs = {} if geo_label_text_kwargs is None else geo_label_text_kwargs

    # Defaults, with any user-supplied kwargs overriding them.
    text_kwargs = {
        "ha": "center",
        "va": "center",
        "color": "#333",
        "size": 9,
        "path_effects": [pe.withStroke(linewidth=4, foreground="white")],
    }
    text_kwargs.update(user_text_kwargs)

    for idx, row in gdf.iterrows():
        # Label at a point guaranteed to fall inside the geometry.
        anchor = row["geometry"].representative_point()

        if isinstance(geo_label, str):
            label = row[geo_label]
        else:
            label = geo_label.loc[idx]

        ax.text(anchor.x, anchor.y, label, **text_kwargs)

    return ax
def plot_us(
    gdf: gpd.GeoDataFrame,
    *args,
    do_relocate_ak_hi_pr: bool = True,
    with_background: bool = False,
    epsg: int = 9311,
    geo_label: Optional[Union[str, pd.Series]] = None,
    geo_label_text_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> plt.Axes:
    """
    Plot a map of the US with AK and HI relocated.

    This function will move and scale AK and
    HI so that they are plotted at the lower left of the other 48 states,
    just below CA, AZ, and NM.

    It also moves the Aleutian islands that are west of -180° longitude
    so that they are plotted next to the rest of AK. Otherwise, they
    tend to be plotted at longitudes just less than +180°, which
    creates visual discontinuities.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        The geometries to be plotted.
    do_relocate_ak_hi_pr
        If `True` try to relocate AK, HI, and PR. Otherwise, still wrap
        the Aleutian islands west of -180° longitude if present.
    args
        Args to pass to the plot.
    with_background
        Should we put in a background map from Open Street maps?
    epsg:
        The EPSG CRS to project to before plotting. Default is 9311, which
        is equal area. See https://epsg.io/9311.
    geo_label
        Name of a column in `gdf` that has labels for each geometry.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`
    kwargs
        Keyword args to pass to the plot.

    Returns
    -------
    ax of the plot.
    """
    if gdf.crs != 4269:
        logger.warning(
            "Expected map to have crs epsg:4269, but got %s instead.", gdf.crs
        )

    if do_relocate_ak_hi_pr:
        gdf = relocate_ak_hi_pr(gdf)
    else:
        # At least wrap the Aleutian islands. Work on a copy so we do
        # not mutate the caller's data frame, consistent with
        # plot_us_boundary.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_polys)

    gdf = gdf.to_crs(epsg=epsg)

    ax = gdf.plot(*args, **kwargs)

    if geo_label is not None:
        _add_plot_map_geo_labels(ax, gdf, geo_label, geo_label_text_kwargs)

    if with_background:
        provider = cx.providers.OpenStreetMap.Mapnik

        cx.add_basemap(ax, crs=gdf.crs.to_string(), source=provider)

    # Lat/lon tick labels are not meaningful on a projected map.
    ax.tick_params(
        left=False,
        right=False,
        bottom=False,
        labelleft=False,
        labelbottom=False,
    )

    return ax
def plot_us_boundary(
    gdf: gpd.GeoDataFrame,
    *args,
    do_relocate_ak_hi_pr: bool = True,
    with_background: bool = False,
    epsg: int = 9311,
    **kwargs,
):
    """
    Plot a map of boundaries of the US with AK, HI, and PR relocated.

    This function is very much like :py:func:`~plot_us` except
    that it plots only the boundaries of geometries.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        The geometries to be plotted.
    args
        Args to pass to the plot.
    do_relocate_ak_hi_pr
        If `True` try to relocate AK, HI, and PR. Otherwise, still wrap
        the Aleutian islands west of -180° longitude if present.
    with_background
        Should we put in a background map from Open Street maps?
    epsg:
        The EPSG CRS to project to before plotting. Default is 9311, which
        is equal area. See https://epsg.io/9311.
    kwargs
        Keyword args to pass to the plot.

    Returns
    -------
    ax of the plot.
    """
    if gdf.crs != 4269:
        # Bug fix: `gdf.crs` is a CRS object, not an integer, so the old
        # "%d" conversion raised a TypeError whenever this warning fired.
        # Use "%s", matching the analogous warning in `plot_us`.
        logger.warning(
            "Expected map to have crs epsg:4269, but got %s instead.", gdf.crs
        )

    if do_relocate_ak_hi_pr and with_background:
        logger.warning(
            "do_relocate_ak_hi_pr and with_background should not be used together. "
            "Undesired results are likely."
        )

    if do_relocate_ak_hi_pr:
        gdf = relocate_ak_hi_pr(gdf)
    else:
        # At least wrap the Aleutian islands west of -180°. Copy first
        # so we do not mutate the caller's data frame.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_polys)

    gdf = gdf.to_crs(epsg=epsg)

    ax = gdf.boundary.plot(*args, **kwargs)

    if with_background:
        provider = cx.providers.OpenStreetMap.Mapnik

        cx.add_basemap(ax, crs=gdf.crs.to_string(), source=provider)

    return ax
def geographic_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoSeries:
    """
    Compute the centroids of the geometries in a geo data frame.

    The geometries are first projected into EPSG:3857
    (https://epsg.io/3857), centroids are computed there, and the
    resulting points are projected back into the original crs. This
    gives a reasonable answer for most geometries and avoids warnings
    from `GeoPandas` about centroids in a geographic crs.

    Parameters
    ----------
    gdf
        A geo data frame in any crs.

    Returns
    -------
    A geo data series of the centroids of all the geometries in
    `gdf`.
    """
    original_crs = gdf.crs

    # Project, take centroids, and project the points back.
    return gdf.geometry.to_crs(epsg=3857).centroid.to_crs(original_crs)
def sjoin_mostly_contains(
    gdf_large_geos: gpd.GeoDataFrame,
    gdf_small_geos: gpd.GeoDataFrame,
    large_suffix: str = "large",
    small_suffix: str = "small",
    area_threshold: float = 0.8,
    area_epsg: int = 3857,
) -> gpd.GeoDataFrame:
    """
    Spatial join based on fraction of contained area.

    This function is designed to implement the common case
    where we have a number of small geo areas like census
    tracts or block groups in a large area like a CBSA. The
    reason to use this instead of `gpd.GeoDataFrame.sjoin`
    directly is that the smaller geos may not all be
    strictly contained in the bounds of the larger geos.
    And small geos outside the bounds of the larger one
    may intersect along the boundary. So instead, this method
    looks for small geos whose area is at least 80%
    (or another chosen number) within the larger area.

    Parameters
    ----------
    gdf_large_geos
        A geo data frame of one or more large geo areas like CBSAs.
    gdf_small_geos
        A geo data frame of smaller areas like census tracts.
    large_suffix
        Suffix to add to column names from the large side when
        the same name appears in both.
    small_suffix
        Suffix to add to column names from the small side when
        the same name appears in both.
    area_threshold
        The fraction of each smaller area that must be covered by
        one of the large areas to be joined with it.
    area_epsg
        The CRS to project to before doing area calculations.
        Defaults to 3857. (https://epsg.io/3857)

    Returns
    -------
    Geo data frame of the spatially joined results.

    Raises
    ------
    ValueError
        If the two geo data frames are not in the same crs.
    """
    if gdf_large_geos.crs != gdf_small_geos.crs:
        raise ValueError(
            "Can only join geometries of the same crs. "
            f"Got {gdf_large_geos.crs} and {gdf_small_geos.crs}"
        )

    # Bug fix: work on copies so that the temporary `_original_*`
    # columns below are not added to the caller's data frames in place.
    gdf_large_geos = gdf_large_geos.copy()
    gdf_small_geos = gdf_small_geos.copy()

    large_col = f"_original_large_geos_{area_epsg}"
    small_col = f"_original_small_geos_{area_epsg}"

    # Keep the original geos around, projected to `area_epsg`, so
    # we can check intersection areas after the join.
    gdf_large_geos[large_col] = gdf_large_geos.geometry.to_crs(epsg=area_epsg)
    gdf_small_geos[small_col] = gdf_small_geos.geometry.to_crs(epsg=area_epsg)

    # Do an intersection join.
    gdf_intersection = gdf_small_geos.sjoin(
        gdf_large_geos,
        how="inner",
        predicate="intersects",
        lsuffix=small_suffix,
        rsuffix=large_suffix,
    )

    # Filter down to only those rows where the area of the intersection
    # covers at least `area_threshold` of the small geo's area.
    intersection_area = (
        gdf_intersection[small_col].intersection(gdf_intersection[large_col]).area
    )

    gdf_results = gdf_intersection[
        intersection_area >= area_threshold * gdf_intersection[small_col].area
    ]

    # Drop the temporary projected-geometry columns.
    gdf_results = gdf_results.drop([small_col, large_col], axis="columns").copy()

    # Move the geometry column to the end, per GeoDataFrame convention.
    gdf_results = gdf_results[
        [col for col in gdf_results.columns if col != "geometry"] + ["geometry"]
    ]

    return gdf_results