Coverage for censusdis/maps.py: 94%
331 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
1# Copyright (c) 2022 Darren Erik Vengroff
2"""
3Utilities for loading and rendering maps.
5This module relies on shapefiles from the US Census,
6which it downloads as needed and caches locally.
7"""
9import importlib.resources
10import shutil
11from logging import getLogger
12from pathlib import Path
13from typing import Dict, Optional, Tuple, Union, Any
14from zipfile import BadZipFile, ZipFile
16import contextily as cx
17import geopandas as gpd
18import matplotlib.pyplot as plt
19import pandas as pd
20import requests
21import shapely.affinity
22from haversine import haversine
23from shapely.geometry import MultiPolygon, Point, Polygon
24from shapely.geometry.base import BaseGeometry
25import matplotlib.patheffects as pe
27from censusdis.impl.exceptions import CensusApiException
28from censusdis.impl.fetch import certificates
29from censusdis.states import AK, HI, NAMES_FROM_IDS, PR
31logger = getLogger(__name__)
class MapException(CensusApiException):
    """An exception generated from `censusdis.maps` code."""
class ShapeReader:
    """
    A class for reading shapefiles into GeoPandas GeoDataFrames.

    See the demo notebooks for more details. The shapefiles need
    to already have been downloaded to the local machine. We may
    add a lazy option in the future that will fetch them if they
    don't exist.

    Parameters
    ----------
    shapefile_root
        The location in the filesystem where shapefiles are stored.
    year
        The year we want shapefiles for.
    auto_fetch
        If `True` then fetch remote shape files as needed.
    """

    def __init__(
        self,
        shapefile_root: Optional[Union[str, Path]] = None,
        year: int = 2020,
        auto_fetch: bool = True,
    ):
        if shapefile_root is None:
            # Default to a per-user cache directory, creating it if needed.
            shapefile_root = Path.home() / ".censusdis" / "data" / "shapefiles"
            shapefile_root.mkdir(exist_ok=True, parents=True)
        else:
            shapefile_root = Path(shapefile_root)

        self._shapefile_root = shapefile_root
        self._year = year
        self._auto_fetch = auto_fetch

    @property
    def shapefile_root(self) -> Path:
        """The path at which shapefiles are cached locally."""
        return self._shapefile_root

    def _read_shapefile(
        self, base_name: str, base_url: str, crs, timeout: int
    ) -> gpd.GeoDataFrame:
        """Read a shapefile, fetching and caching it first if necessary."""
        self._auto_fetch_file(base_name, base_url, timeout=timeout)

        path = self._shapefile_full_path(base_name)

        gdf = gpd.read_file(path)
        if crs is not None:
            gdf.to_crs(crs, inplace=True)
        return gdf

    def _shapefile_full_path(self, basename: str) -> Path:
        """Construct the full path to a shapefile in the local cache."""
        return self._shapefile_root / basename / f"{basename}.shp"

    def _2008_2009_tiger(self, prefix, shapefile_scope: str, suffix) -> Tuple[str, str]:
        """Construct the base URL and name for a 2008 or 2009 TIGER file."""
        # Sometimes we have to go down into a named
        # state directory.
        if suffix in ["cosub", "tract", "bg"]:
            state = shapefile_scope
            state_name = NAMES_FROM_IDS[state].upper().replace(" ", "_")

            base_url = f"https://www2.census.gov/geo/tiger/TIGER{self._year}/{state}_{state_name}"
            name = f"{prefix}_{self._year}_{state}_{suffix}00"
        else:
            # TODO - what to do here?
            base_url, name = self._through_2010_tiger(prefix, shapefile_scope, suffix)

        return base_url, name

    def _through_2010_tiger(self, prefix, shapefile_scope: str, suffix):
        """Construct the base URL and name for a TIGER file from 2010 or earlier."""
        # Curiously, the server side puts the 2000 files under
        # the TIGER2010 directory early in the path and early
        # in the file name.
        path_year = self._year
        path_year = max(path_year, 2010)

        base_url = f"https://www2.census.gov/geo/tiger/TIGER{path_year}/{suffix.upper()}/{self._year}"
        name = f"{prefix}_{path_year}_{shapefile_scope}_{suffix}{str(self._year)[-2:]}"
        return base_url, name

    def _post_2010_tiger(self, prefix, shapefile_scope: str, suffix):
        """Construct the base URL and name for a TIGER file after 2010."""
        # Special case for whatever reason the US Census decided.
        base_url = (
            f"https://www2.census.gov/geo/tiger/TIGER{self._year}/{suffix.upper()}"
        )

        if self._year == 2020 and suffix == "tabblock":
            base_url = f"{base_url}20"
            suffix = f"{suffix}20"

        if self._year in [2020, 2021] and suffix == "puma":
            suffix = f"{suffix}10"

        name = f"{prefix}_{self._year}_{shapefile_scope}_{suffix}"
        return base_url, name

    def tiger_url(self, prefix, shapefile_scope, suffix) -> Tuple[str, str]:
        """Construct a URL for a TIGER file."""
        if self._year == 2010 or self._year < 2008:
            return self._through_2010_tiger(prefix, shapefile_scope, suffix)
        elif 2008 <= self._year <= 2009:
            return self._2008_2009_tiger(prefix, shapefile_scope, suffix)
        else:
            return self._post_2010_tiger(prefix, shapefile_scope, suffix)

    def _tiger(self, shapefile_scope: str, geography, crs, timeout: int):
        """Read a TIGER shapefile and normalize its column names."""
        prefix, suffix = ("tl", geography)

        base_url, name = self.tiger_url(prefix, shapefile_scope, suffix)

        gdf = self._read_shapefile(name, base_url, crs, timeout=timeout)

        # Pull off the extra two digits of year that get tacked
        # on in some cases.
        def mapper(col: str) -> str:
            if col.endswith(("20", "10", "00")):
                return col[:-2]
            return col

        gdf.rename(mapper, axis="columns", inplace=True)

        if "STATEFP" not in gdf.columns:
            gdf["STATEFP"] = shapefile_scope

        return gdf

    # The summary level to use for each of the resolutions we
    # support.
    # See https://www2.census.gov/geo/tiger/GENZ2010/ReadMe.pdf
    _CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010 = {
        "state": "040",
        "county": "050",
        "cousub": "060",
        "tract": "140",
        "bg": "150",
    }

    def _through_2010_cb(
        self, cartographic_scope: str, geography: str, resolution: str
    ):
        """Construct the base URL and name for a cartographic boundary file from 2010 or earlier."""
        if geography not in self._CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010:
            # Use an f-string so the geography actually appears in the
            # message. Exceptions do not do lazy %-style interpolation
            # the way loggers do, so the original two-argument form
            # never substituted the geography into the text.
            raise MapException(
                f"Don't know how to interpret geography '{geography}' for pre-2010 maps."
            )

        summary_level = self._CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010[geography]

        name = f"gz_{self._year}_{cartographic_scope}_{summary_level}_00_{resolution}"
        base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}"

        return base_url, name

    def _post_2010_cb(self, cartographic_scope: str, geography, resolution: str):
        """Construct the base URL and name for a cartographic boundary file after 2010."""
        # May need to revise when 2020 PUMA is published.
        if geography == "puma" and 2010 <= self._year < 2022:
            geography = "puma10"
        elif geography == "puma" and self._year >= 2022:
            geography = "puma20"

        name = f"cb_{self._year}_{cartographic_scope}_{geography}_{resolution}"

        # The shp subdirectory did not appear until 2014.
        if self._year >= 2014:
            base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}/shp"
        else:
            base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}"

        return base_url, name

    def _cartographic_bound(
        self, shapefile_scope, geography, resolution, crs, *, timeout: int
    ) -> gpd.GeoDataFrame:
        """Read a cartographic boundary file and normalize its column names."""
        if self._year <= 2010:
            base_url, name = self._through_2010_cb(
                shapefile_scope, geography, resolution
            )
        else:
            base_url, name = self._post_2010_cb(shapefile_scope, geography, resolution)

        gdf = self._read_shapefile(name, base_url, crs, timeout=timeout)

        # Some files on the server, like
        # https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_050_00_500k.zip
        # leave the 'FP' suffix of column names.
        gdf.rename(
            {
                "STATE": "STATEFP",
                "COUNTY": "COUNTYFP",
                "TRACT": "TRACTCE",
                "BLKGRP": "BLKGRPCE",
            },
            axis="columns",
            inplace=True,
        )

        return gdf

    def read_shapefile(
        self, shapefile_scope: str, geography: str, crs=None, *, timeout: int = 30
    ):
        """
        Read the geometries of geographies.

        This method reads maps suitable
        for use with geometric joins and queries of various types. If you are
        only interested in plotting maps, the
        :py:meth:`~ShapeReader.read_cb_shapefile` method may be more suitable.

        The files are read from the US Census servers and cached locally.
        They are in most cases the same files you can download manually from
        https://www.census.gov/cgi-bin/geo/shapefiles/index.php.

        Individual files the API may download follow a naming convention
        that has evolved a bit over time. So for example a 2010 block group file
        for New Jersey would be found at
        https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_34_bg10.zip
        whereas a similar file for 2020 would be at
        https://www2.census.gov/geo/tiger/TIGER2020/BG/tl_2020_34_bg.zip.

        This method knows many of the subtle changes that have occurred over the years,
        so you should mostly not have to worry about them. It is unlikely it knows
        them all, so please submit an
        issue at https://github.com/vengroff/censusdis/issues if you find
        otherwise.

        Once read, the files are cached locally so that when we reuse the same
        files we do not have to go back to the server.

        Parameters
        ----------
        shapefile_scope
            The geography that is covered by the entire shapefile. In some
            cases, this is a state, e.g. `NJ`. For cases where files
            are available for the entire country, the string `"us"` is typically
            used. In some rare cases, like for the Alaska Native Regional Corporations
            (``"anrc"``) geography, other strings like ``"02"`` are used.
            See the download links at
            https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            if you need to debug issues with a given geography.
        geography
            The geography we want to download bounds for. Supported
            geometries are `"state'`, `"county"`, `"cousub"` (county subdivision),
            `"tract"`, and `"bg"` (block group). Other geometries as defined
            by the US Census may work, but have not been thoroughly tested.
        crs
            The crs to make the file to. If `None`, use the default
            crs of the shapefile. Setting this is useful if we plan
            to merge the resulting `GeoDataFrame` with another so we
            can make sure they use the same crs.
        timeout
            Time out limit (in seconds) for the remote call.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the requested
        geometries.
        """
        return self._tiger(shapefile_scope, geography, crs, timeout=timeout)

    def read_cb_shapefile(
        self,
        shapefile_scope: str,
        geography: str,
        resolution: str = "500k",
        crs=None,
        *,
        timeout: int = 30,
    ) -> gpd.GeoDataFrame:
        """
        Read the cartographic boundaries of a given geography.

        These are smaller
        files suited for plotting maps, as compared to those returned by
        :py:meth:`~ShapeReader.read_shapefile`, which returns higher
        resolution geometries.

        The files are read from the US Census servers and cached locally.
        They are in most cases the same files you can download manually from
        https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        or similar URLs for other years.

        Individual files the API may download follow a naming convention
        that has evolved a bit over time. So for example a 2010 census tract
        cartographic bounds file
        for New Jersey at 500,000:1 resolution would be found at
        https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_34_140_00_500k.zip
        whereas a similar file for 2020 would be at
        https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_34_tract_500k.zip

        This method knows many of the subtle changes that have occurred over the years,
        so you should mostly not have to worry about them. It is unlikely it knows
        them all, so please submit an
        issue at https://github.com/vengroff/censusdis/issues if you find
        otherwise.

        Once read, the files are cached locally so that when we reuse the same
        files we do not have to go back to the server.

        Parameters
        ----------
        shapefile_scope
            The geography that is covered by the entire shapefile. In some
            cases, this is a state, e.g. `NJ`. For cases where files
            are available for the entire country, the string `"us"` is typically
            used. In some rare cases, like for the Alaska Native Regional Corporations
            (``"anrc"``) geography, other strings like ``"02"`` are used.
            See the download links at
            https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            if you need to debug issues with a given geography.
        geography
            The geography we want to download bounds for. Supported
            geometries are `"state'`, `"county"`, `"cousub"` (county subdivision),
            `"tract"`, and `"bg"` (block group)
        resolution
            What resolution shapes should we use. Permitted options are
            `"500k"`, `"5m"`, and `"20m"` for 1:500,000, 1:5,000,000, and
            1:20,000,000 resolution respectively. Availability varies, but for
            most geographies `"500k"` is available even if others are not.
        crs
            The crs to make the file to. If `None`, use the default
            crs of the shapefile. Setting this is useful if we plan
            to merge the resulting `GeoDataFrame` with another so we
            can make sure they use the same crs.
        timeout
            Time out limit (in seconds) for the remote call.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the boundaries of the requested
        geometries.
        """
        return self._cartographic_bound(
            shapefile_scope, geography, resolution, crs, timeout=timeout
        )

    def try_cb_tiger_shapefile(
        self,
        shapefile_scope: str,
        geography: str,
        resolution: str = "500k",
        crs=None,
        *,
        timeout: int = 60,
    ) -> gpd.GeoDataFrame:
        """
        Try to retrieve CB file.

        Wraps read_cb_shapefile and read_shapefile.

        If unable to find, attempts to find the full tiger file.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the boundaries of the requested
        geometries.
        """
        try:
            gdf = self._cartographic_bound(
                shapefile_scope, geography, resolution, crs, timeout=timeout
            )
        except MapException as e:
            # Fall back to the full-resolution TIGER file. Use a lazy
            # %s placeholder; passing the exception as a bare extra
            # argument (with no placeholder) triggers a logging
            # formatting error instead of logging the exception.
            logger.debug("Exception loading cb file. Trying tiger instead. %s", e)
            gdf = self._tiger(shapefile_scope, geography, crs, timeout=timeout)

        return gdf

    def _auto_fetch_file(self, name: str, base_url: str, *, timeout: int):
        """Fetch a remote file, but only if auto-fetching is enabled."""
        if not self._auto_fetch:
            return

        self._fetch_file(name, base_url, timeout=timeout)

    def _url_for_file(self, name: str) -> str:
        """Guess the remote URL for a shapefile based on its name."""
        if name.startswith("cb_"):
            return f"https://www2.census.gov/geo/tiger/GENZ{self._year}/shp/{name}.zip"

        if name.startswith("tl_"):
            suffix = name.split("_")[-1]

            if self._year <= 2010:
                return (
                    f"https://www2.census.gov/geo/tiger/TIGER{self._year}/"
                    f"{suffix.upper()[:-2]}/{self._year}/{name}.zip"
                )

            return (
                f"https://www2.census.gov/geo/tiger/TIGER{self._year}/"
                f"{suffix.upper()}/{name}.zip"
            )

        # This will not work, but it's the main download page where we
        # can start to look for what we want.
        return "https://www.census.gov/cgi-bin/geo/shapefiles/index.php"

    def _fetch_file(
        self,
        name: str,
        base_url: str,
        *,
        timeout: int,
    ) -> None:
        """Download and unzip a shapefile into the local cache if not already there."""
        dir_path = self._shapefile_root / name

        if dir_path.is_dir():
            # Does it have the .shp file? If not maybe something
            # random went wrong in the previous attempt, or someone
            # deleted some stuff by mistake. So delete it and
            # reload.
            shp_path = dir_path / f"{name}.shp"
            if shp_path.is_file():
                # Looks like the shapefile is there.
                return

            # No shapefile so remove the whole directory and
            # hope for the best when we recreate it.
            shutil.rmtree(dir_path)

        # Make the directory
        dir_path.mkdir()

        # We will put the zip file in the dir we just created.
        zip_path = dir_path / f"{name}.zip"

        # Construct the URL to get the zip file.
        zip_url = f"{base_url}/{name}.zip"

        # Fetch the zip file and write it.
        response = requests.get(
            zip_url,
            timeout=timeout,
            cert=certificates.map_cert,
            verify=certificates.map_verify,
        )

        if response.status_code == 404:
            raise MapException(
                f"{zip_url} was not found. "
                "The Census Bureau may not publish the shapefile you are looking for for the given year. "
                "Or the file you are looking for may be from a year where a naming convention that censusdis "
                "does not recognize was used."
            )

        headers = response.headers
        content_type = headers.get("Content-Type", None)

        if content_type != "application/zip":
            # The original message had an unbalanced quote around the
            # expected content type; both quotes are now present.
            raise MapException(
                f"Expected content type 'application/zip' from {zip_url}, but got '{content_type}' instead."
            )

        with zip_path.open("wb") as file:
            file.write(response.content)

        # Unzip the file and extract all contents.
        try:
            with ZipFile(zip_path) as zip_file:
                zip_file.extractall(dir_path)
        except BadZipFile as exc:
            raise MapException(f"Bad zip file retrieved from {zip_url}") from exc
        finally:
            # We don't need the zipfile anymore.
            zip_path.unlink()
def clip_to_states(gdf, gdf_bounds):
    """
    Clip every geometry in a gdf to the state it belongs to, from the states in the state bounds.

    We clip to state bounds so that we don't plot areas
    outside the state. Typically, this clips areas that
    extend out into the water in coastal areas so we don't
    get strange artifacts in the water in plots.

    The way we tell what state an input geometry belongs to
    is by looking at the `STATEFP` column for that geometry's
    row in the input.

    Parameters
    ----------
    gdf
        The input geometries.
    gdf_bounds
        The state bounds.

    Returns
    -------
    The input geometries where each is clipped to the bounds
    of the state to which it belongs.
    """

    def _clip_state_group(state_group):
        # `state_group.name` is the STATEFP of this group; clip the
        # group's geometries to that one state's boundary.
        state_boundary = gdf_bounds[gdf_bounds.STATEFP == state_group.name]
        return gpd.clip(state_group, state_boundary)

    clipped = gdf.groupby(gdf.STATEFP).apply(_clip_state_group)
    return clipped.droplevel("STATEFP")
def _wrap_poly(poly: Union[Polygon, Point]):
    """
    Move a polygon (or point) westward across the antimeridian if needed.

    Used in shifting AK and HI geometries.
    """
    if isinstance(poly, Point):
        first_x = poly.x
    elif isinstance(poly, Polygon):
        exterior_xs, _ = poly.exterior.coords.xy
        first_x = exterior_xs[0]
    else:
        # Not sure how to parse it, so leave it
        # where it is.
        logger.warning(f"Unrecognized type {type(poly)} can't be wrapped.")
        return poly

    # A positive first longitude means the geometry sits just east of
    # the antimeridian; shift it 360° west so it plots next to AK.
    if first_x > 0:
        return shapely.affinity.translate(poly, xoff=-360.0, yoff=0.0)

    return poly
def _wrap_polys(polys):
    """
    Wrap each part of a multi-polygon across the antimeridian.

    Used in shifting AK and HI geometries.
    """
    if isinstance(polys, MultiPolygon):
        return MultiPolygon([_wrap_poly(part) for part in polys.geoms])

    # Not a MultiPolygon after all; wrap the single geometry directly.
    return _wrap_poly(polys)
# Boxes that contain AK and HI after _wrap_polys has
# been applied to it. We use this to identify
# geometries that we want to relocate in relocate_ak_hi
# when we don't have a STATEFP or STATE column to help
# identify what is in AK or HI.

# Bounding box of Alaska, after the Aleutian islands west of -180°
# have been wrapped by _wrap_polys.
_AK_MIN_X = -188.0
_AK_MIN_Y = 51.0
_AK_MAX_X = -129.0
_AK_MAX_Y = 72.0

_AK_BOUNDS = Polygon(
    (
        (_AK_MIN_X, _AK_MIN_Y),
        (_AK_MAX_X, _AK_MIN_Y),
        (_AK_MAX_X, _AK_MAX_Y),
        (_AK_MIN_X, _AK_MAX_Y),
        (_AK_MIN_X, _AK_MIN_Y),
    )
)

# Bounding box of Hawaii.
_HI_MIN_X = -179.0
_HI_MIN_Y = 18.0
_HI_MAX_X = -154.0
_HI_MAX_Y = 29.0

_HI_BOUNDS = Polygon(
    (
        (_HI_MIN_X, _HI_MIN_Y),
        (_HI_MAX_X, _HI_MIN_Y),
        (_HI_MAX_X, _HI_MAX_Y),
        (_HI_MIN_X, _HI_MAX_Y),
        (_HI_MIN_X, _HI_MIN_Y),
    )
)

# Bounding box of Puerto Rico.
_PR_MIN_X = -68.0
_PR_MIN_Y = 17.0
_PR_MAX_X = -65.0
_PR_MAX_Y = 19.0

_PR_BOUNDS = Polygon(
    (
        (_PR_MIN_X, _PR_MIN_Y),
        (_PR_MAX_X, _PR_MIN_Y),
        (_PR_MAX_X, _PR_MAX_Y),
        (_PR_MIN_X, _PR_MAX_Y),
        (_PR_MIN_X, _PR_MIN_Y),
    )
)
def _relocate_ak(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the AK bounding box.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    # Shrink AK around Anchorage, then shift it to sit below the
    # lower 48 states.
    anchorage = (-149.9003, 61.2181)
    shrunk = shapely.affinity.scale(geo, xfact=0.25, yfact=0.4, origin=anchorage)
    return shapely.affinity.translate(shrunk, xoff=33, yoff=-34)
def _relocate_hi(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the HI bounding box.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    # Shift HI east and north so it plots below the lower 48 states.
    return shapely.affinity.translate(geo, xoff=50, yoff=6)
def _relocate_pr(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the PR bounding box.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    # Nudge PR west and north so it plots closer to the mainland.
    return shapely.affinity.translate(geo, xoff=-7, yoff=8)
def _relocate_parts_in_ak_hi_pr(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate any sub-geometries that happen to fall in the AK or HI or PR bounding boxes.

    If the geometry is a simple polygon, check if it intersects the
    bounding boxes of AK or HI and relocate if so. If it is a
    `MultiPolygon` then recurse in and relocate some
    contained geometries as appropriate. This way it can work on small
    polygons completely contained in the bounding box, or on larger
    multi-polygons like regions that may have some polygons in the bounding
    box and others outside it.

    Parameters
    ----------
    geo
        The geography.

    Returns
    -------
    The geography, possibly with some parts relocated.
    """
    if isinstance(geo, MultiPolygon):
        # Recurse so each part is tested and relocated independently.
        return MultiPolygon([_relocate_parts_in_ak_hi_pr(part) for part in geo.geoms])

    # A single polygon: relocate it if it falls inside one of the
    # bounding boxes of the areas we move.
    if geo.intersects(_AK_BOUNDS):
        return _relocate_ak(geo)
    if geo.intersects(_HI_BOUNDS):
        return _relocate_hi(geo)
    if geo.intersects(_PR_BOUNDS):
        return _relocate_pr(geo)

    return geo
def _wrap_and_relocate_geos(geo: BaseGeometry):
    """Wrap across the antimeridian, then relocate any AK/HI/PR parts."""
    wrapped = _wrap_polys(geo)
    return _relocate_parts_in_ak_hi_pr(wrapped)
def _relocate_ak_hi_pr_group(group):
    """
    Relocate a group of geometries.

    They are relocated if they belong to AK, HI or PR, otherwise
    they are left alone.
    """
    state = group.name

    if state == AK:
        # Deal with the Aleutian islands wrapping at -180/180 longitude
        # before relocating.
        group.geometry = group.geometry.apply(_wrap_polys).apply(_relocate_ak)
    elif state == HI:
        group.geometry = group.geometry.apply(_relocate_hi)
    elif state == PR:
        group.geometry = group.geometry.apply(_relocate_pr)

    return group
def relocate_ak_hi_pr(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Relocate any geometry that is in Alaska, Hawaii or Puerto Rico for plotting purposes.

    We first try an optimization. If there is a `STATEFP`
    column then we relocate rows where that column has a value of
    `AK`, `HI` or `PR`. If there is not a
    `STATEFP` column we check for a `STATE` column and do the same. If neither
    column exists then we dig down into the geometries themselves
    and relocate those that intersect bounding rectangles of the
    relocated states.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        the geo data frame to relocate.

    Returns
    -------
    a geo data frame with any geometry in AK or HI moved for plotting.
    """
    if "STATEFP" in gdf.columns or "STATE" in gdf.columns:
        # There is a column identifying the state of each geometry
        # so use that to decide what to relocate.
        if "STATEFP" in gdf.columns:
            group_column = "STATEFP"
        else:
            group_column = "STATE"

        # Group by state, relocate each group, then flatten the
        # resulting (state, row) MultiIndex back into a plain index
        # while restoring the state column dropped by
        # `include_groups=False`.
        gdf = (
            gdf.groupby(gdf[group_column], group_keys=True, sort=False)
            .apply(
                _relocate_ak_hi_pr_group,
                include_groups=False,
            )
            .reset_index(level=1, drop=True)
            .reset_index(drop=False)
        )
    else:
        # There is no column indicating the state of each geometry. This
        # is often because the geometries span states. So we can't easily
        # relocate the two states, but we at least wrap the Aleutian
        # islands if present and then relocate any geometries that are
        # in the bounding boxes of AK and HI.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_and_relocate_geos)

    return gdf
# Lazily-loaded singleton; populated on first call to _gdf_crs_bounds().
__gdf_crs_bounds: Optional[gpd.GeoDataFrame] = None
"""The bounds of all the CRSs we might use in `plot_map`."""
def _gdf_crs_bounds() -> gpd.GeoDataFrame:
    """
    Construct a dataframe with the bounds of all the CRSs we might use in `plot_map`.

    Returns
    -------
    The dataframe. It is a singleton you should not modify.
    """
    global __gdf_crs_bounds

    # Already loaded on a previous call; reuse the singleton.
    if __gdf_crs_bounds is not None:
        return __gdf_crs_bounds

    # First call: load the geojson resource bundled with the package.
    resource_package = f"{__package__}.resources"
    with importlib.resources.path(resource_package, "crs_bounds.geojson") as path:
        __gdf_crs_bounds = gpd.GeoDataFrame.from_file(path)

    return __gdf_crs_bounds
def _closest_epsg(
    gdf: gpd.GeoDataFrame,
) -> int:
    """
    Find the EPSG to use by choosing the one closest to the center of the bounds of the gdf.

    We do this by looking at haversine distance between the
    centers. I am sure we could do this more efficiently
    than a linear scan using a data structure optimized for
    nearest neighbor queries on the surface of a sphere.
    But we only have about 100 candidates, so the linear scan is
    simple and not terrible.

    Parameters
    ----------
    gdf
        The gdf we want to plot.

    Returns
    -------
    The EPSG number.
    """
    candidates = _gdf_crs_bounds()

    # Compare in the same CRS as the candidate bounds.
    if gdf.crs != candidates.crs:
        gdf = gdf.copy(deep=True).to_crs(candidates.crs)

    min_x, min_y, max_x, max_y = gdf.total_bounds

    # Wrap the end of the Aleutian island chain.
    if min_x > 0:
        min_x = min_x - 360.0
    if max_x > 0:
        max_x = max_x - 360.0

    # Center of the (possibly wrapped) bounding box.
    lon = (min_x + max_x) / 2
    lat = (min_y + max_y) / 2

    centers = candidates.representative_point()

    # Linear scan for the candidate CRS whose center is nearest.
    distances = centers.map(lambda center: haversine((center.y, center.x), (lat, lon)))

    return candidates["epsg"].iloc[distances.argmin()]
def plot_map(
    gdf: gpd.GeoDataFrame,
    *args,
    with_background: bool = False,
    epsg: Optional[int] = None,
    geo_label: Optional[Union[str, pd.Series]] = None,
    geo_label_text_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> plt.Axes:
    """
    Plot a map, optionally with a background.

    Parameters
    ----------
    gdf
        The geo data frame to plot
    args
        Optional args to matplotlib
    with_background
        Should we put in a background map from Open Street maps?
    epsg
        The EPSG to project to. Otherwise a suitable one for the
        geometry will be inferred.
    geo_label
        Name of a column in `gdf` that has labels for each geometry.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`
    kwargs
        keyword args to pass on to matplotlib

    Returns
    -------
    The ax of the resulting plot.
    """
    # Infer a suitable projection if the caller did not choose one.
    if epsg is None:
        epsg = _closest_epsg(gdf)

    projected = gdf.to_crs(epsg=epsg)

    ax = projected.plot(*args, **kwargs)

    if geo_label is not None:
        _add_plot_map_geo_labels(ax, projected, geo_label, geo_label_text_kwargs)

    # Lat/lon tick labels are not meaningful on a projected map.
    ax.tick_params(
        left=False,
        right=False,
        bottom=False,
        labelleft=False,
        labelbottom=False,
    )

    if with_background:
        cx.add_basemap(
            ax,
            crs=projected.crs.to_string(),
            source=cx.providers.OpenStreetMap.Mapnik,
        )

    return ax
def _add_plot_map_geo_labels(
    ax: plt.Axes,
    gdf: gpd.GeoDataFrame,
    geo_label: Union[str, pd.Series],
    geo_label_text_kwargs: Optional[Dict[str, Any]],
) -> plt.Axes:
    """
    Add labels to each geometry in a plot.

    Internal utility function behind `plot_map` and `plot_us`.

    Parameters
    ----------
    ax
        The ax we already plotted on.
    gdf
        The geographic data frame we are plotting.
    geo_label
        The column in `gdf` that contains the labels to add to the plot.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`

    Returns
    -------
    `ax`
    """
    user_text_kwargs = {} if geo_label_text_kwargs is None else geo_label_text_kwargs

    # Defaults, with any user-supplied kwargs overriding them.
    text_kwargs = {
        "ha": "center",
        "va": "center",
        "color": "#333",
        "size": 9,
        "path_effects": [pe.withStroke(linewidth=4, foreground="white")],
    }
    text_kwargs.update(user_text_kwargs)

    for idx, row in gdf.iterrows():
        # Label at a point guaranteed to fall inside the geometry.
        anchor = row["geometry"].representative_point()

        if isinstance(geo_label, str):
            label = row[geo_label]
        else:
            label = geo_label.loc[idx]

        ax.text(anchor.x, anchor.y, label, **text_kwargs)

    return ax
def plot_us(
    gdf: gpd.GeoDataFrame,
    *args,
    do_relocate_ak_hi_pr: bool = True,
    with_background: bool = False,
    epsg: int = 9311,
    geo_label: Optional[Union[str, pd.Series]] = None,
    geo_label_text_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> plt.Axes:
    """
    Plot a map of the US with AK and HI relocated.

    This function will move and scale AK and
    HI so that they are plotted at the lower left of the other 48 states,
    just below CA, AZ, and NM.

    It also moves the Aleutian islands that are west of -180° longitude
    so that they are plotted next to the rest of AK. Otherwise, they
    tend to be plotted at longitudes just less than +180°, which
    creates visual discontinuities.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        The geometries to be plotted.
    do_relocate_ak_hi_pr
        If `True` try to relocate AK, HI, and PR. Otherwise, still wrap
        the Aleutian islands west of -180° longitude if present.
    args
        Args to pass to the plot.
    with_background
        Should we put in a background map from Open Street maps?
    epsg:
        The EPSG CRS to project to before plotting. Default is 9311, which
        is equal area. See https://epsg.io/9311.
    geo_label
        Name of a column in `gdf` that has labels for each geometry.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`
    kwargs
        Keyword args to pass to the plot.

    Returns
    -------
    ax of the plot.
    """
    if gdf.crs != 4269:
        logger.warning(
            "Expected map to have crs epsg:4269, but got %s instead.", gdf.crs
        )

    if do_relocate_ak_hi_pr:
        gdf = relocate_ak_hi_pr(gdf)
    else:
        # At least wrap the Aleutian islands. Work on a copy so we do
        # not mutate the caller's data frame, consistent with
        # plot_us_boundary.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_polys)

    gdf = gdf.to_crs(epsg=epsg)

    ax = gdf.plot(*args, **kwargs)

    if geo_label is not None:
        _add_plot_map_geo_labels(ax, gdf, geo_label, geo_label_text_kwargs)

    if with_background:
        provider = cx.providers.OpenStreetMap.Mapnik

        cx.add_basemap(ax, crs=gdf.crs.to_string(), source=provider)

    # Lat/lon tick labels are not meaningful on a projected map.
    ax.tick_params(
        left=False,
        right=False,
        bottom=False,
        labelleft=False,
        labelbottom=False,
    )

    return ax
def plot_us_boundary(
    gdf: gpd.GeoDataFrame,
    *args,
    do_relocate_ak_hi_pr: bool = True,
    with_background: bool = False,
    epsg: int = 9311,
    **kwargs,
):
    """
    Plot a map of boundaries of the US with AK, HI, and PR relocated.

    This function is very much like :py:func:`~plot_us` except
    that it plots only the boundaries of geometries.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        The geometries to be plotted.
    args
        Args to pass to the plot.
    do_relocate_ak_hi_pr
        If `True` try to relocate AK, HI, and PR. Otherwise, still wrap
        the Aleutian islands west of -180° longitude if present.
    with_background
        Should we put in a background map from Open Street maps?
    epsg:
        The EPSG CRS to project to before plotting. Default is 9311, which
        is equal area. See https://epsg.io/9311.
    kwargs
        Keyword args to pass to the plot.

    Returns
    -------
    ax of the plot.
    """
    if gdf.crs != 4269:
        # Bug fix: `gdf.crs` is a CRS object, not an integer, so the old
        # "%d" conversion raised a TypeError whenever this warning fired.
        # Use "%s", matching the analogous warning in `plot_us`.
        logger.warning(
            "Expected map to have crs epsg:4269, but got %s instead.", gdf.crs
        )

    if do_relocate_ak_hi_pr and with_background:
        logger.warning(
            "do_relocate_ak_hi_pr and with_background should not be used together. "
            "Undesired results are likely."
        )

    if do_relocate_ak_hi_pr:
        gdf = relocate_ak_hi_pr(gdf)
    else:
        # At least wrap the Aleutian islands west of -180°. Copy first
        # so we do not mutate the caller's data frame.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_polys)

    gdf = gdf.to_crs(epsg=epsg)

    ax = gdf.boundary.plot(*args, **kwargs)

    if with_background:
        provider = cx.providers.OpenStreetMap.Mapnik

        cx.add_basemap(ax, crs=gdf.crs.to_string(), source=provider)

    return ax
def geographic_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoSeries:
    """
    Compute the centroids of the geometries in a geo data frame.

    The geometries are first projected into EPSG:3857
    (https://epsg.io/3857), centroids are computed there, and the
    resulting points are projected back into the original crs. This
    gives a reasonable answer for most geometries and avoids warnings
    from `GeoPandas` about centroids in a geographic crs.

    Parameters
    ----------
    gdf
        A geo data frame in any crs.

    Returns
    -------
    A geo data series of the centroids of all the geometries in
    `gdf`.
    """
    original_crs = gdf.crs

    # Project, take centroids, and project the points back.
    return gdf.geometry.to_crs(epsg=3857).centroid.to_crs(original_crs)
def sjoin_mostly_contains(
    gdf_large_geos: gpd.GeoDataFrame,
    gdf_small_geos: gpd.GeoDataFrame,
    large_suffix: str = "large",
    small_suffix: str = "small",
    area_threshold: float = 0.8,
    area_epsg: int = 3857,
) -> gpd.GeoDataFrame:
    """
    Spatial join based on fraction of contained area.

    This function is designed to implement the common case
    where we have a number of small geo areas like census
    tracts or block groups in a large area like a CBSA. The
    reason to use this instead of `gpd.GeoDataFrame.sjoin`
    directly is that the smaller geos may not all be
    strictly contained in the bounds of the larger geos.
    And small geos outside the bounds of the larger one
    may intersect along the boundary. So instead, this method
    looks for small geos whose area is at least 80%
    (or another chosen number) within the larger area.

    Parameters
    ----------
    gdf_large_geos
        A geo data frame of one or more large geo areas like CBSAs.
    gdf_small_geos
        A geo data frame of smaller areas like census tracts.
    large_suffix
        Suffix to add to column names from the large side when
        the same name appears in both.
    small_suffix
        Suffix to add to column names from the small side when
        the same name appears in both.
    area_threshold
        The fraction of each smaller area that must be covered by
        one of the large areas to be joined with it.
    area_epsg
        The CRS to project to before doing area calculations.
        Defaults to 3857. (https://epsg.io/3857)

    Returns
    -------
    Geo data frame of the spatially joined results.

    Raises
    ------
    ValueError
        If the two geo data frames are not in the same crs.
    """
    if gdf_large_geos.crs != gdf_small_geos.crs:
        raise ValueError(
            "Can only join geometries of the same crs. "
            f"Got {gdf_large_geos.crs} and {gdf_small_geos.crs}"
        )

    # Bug fix: work on copies so that the temporary `_original_*`
    # columns below are not added to the caller's data frames in place.
    gdf_large_geos = gdf_large_geos.copy()
    gdf_small_geos = gdf_small_geos.copy()

    large_col = f"_original_large_geos_{area_epsg}"
    small_col = f"_original_small_geos_{area_epsg}"

    # Keep the original geos around, projected to `area_epsg`, so
    # we can check intersection areas after the join.
    gdf_large_geos[large_col] = gdf_large_geos.geometry.to_crs(epsg=area_epsg)
    gdf_small_geos[small_col] = gdf_small_geos.geometry.to_crs(epsg=area_epsg)

    # Do an intersection join.
    gdf_intersection = gdf_small_geos.sjoin(
        gdf_large_geos,
        how="inner",
        predicate="intersects",
        lsuffix=small_suffix,
        rsuffix=large_suffix,
    )

    # Filter down to only those rows where the area of the intersection
    # covers at least `area_threshold` of the small geo's area.
    intersection_area = (
        gdf_intersection[small_col].intersection(gdf_intersection[large_col]).area
    )

    gdf_results = gdf_intersection[
        intersection_area >= area_threshold * gdf_intersection[small_col].area
    ]

    # Drop the temporary projected-geometry columns.
    gdf_results = gdf_results.drop([small_col, large_col], axis="columns").copy()

    # Move the geometry column to the end, per GeoDataFrame convention.
    gdf_results = gdf_results[
        [col for col in gdf_results.columns if col != "geometry"] + ["geometry"]
    ]

    return gdf_results