Coverage for censusdis/maps.py: 94%

331 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

1# Copyright (c) 2022 Darren Erik Vengroff 

2""" 

3Utilities for loading and rendering maps. 

4 

5This module relies on shapefiles from the US Census, 

6which it downloads as needed and caches locally. 

7""" 

8 

9import importlib.resources 

10import shutil 

11from logging import getLogger 

12from pathlib import Path 

13from typing import Dict, Optional, Tuple, Union, Any 

14from zipfile import BadZipFile, ZipFile 

15 

16import contextily as cx 

17import geopandas as gpd 

18import matplotlib.pyplot as plt 

19import pandas as pd 

20import requests 

21import shapely.affinity 

22from haversine import haversine 

23from shapely.geometry import MultiPolygon, Point, Polygon 

24from shapely.geometry.base import BaseGeometry 

25import matplotlib.patheffects as pe 

26 

27from censusdis.impl.exceptions import CensusApiException 

28from censusdis.impl.fetch import certificates 

29from censusdis.states import AK, HI, NAMES_FROM_IDS, PR 

30 

31logger = getLogger(__name__) 

32 

33 

class MapException(CensusApiException):
    """An exception generated from `censusdis.maps` code."""

36 

37 

class ShapeReader:
    """
    A class for reading shapefiles into GeoPandas GeoDataFrames.

    See the demo notebooks for more details. The shapefiles need
    to already have been downloaded to the local machine. We may
    add a lazy option in the future that will fetch them if they
    don't exist.

    Parameters
    ----------
    shapefile_root
        The location in the filesystem where shapefiles are stored.
    year
        The year we want shapefiles for,
    auto_fetch
        If `True` then fetch remote shape files as needed.
    """

    def __init__(
        self,
        shapefile_root: Optional[Union[str, Path]] = None,
        year: int = 2020,
        auto_fetch: bool = True,
    ):
        if shapefile_root is None:
            # Default to a per-user cache directory, created on demand.
            shapefile_root = Path.home() / ".censusdis" / "data" / "shapefiles"
            shapefile_root.mkdir(exist_ok=True, parents=True)
        else:
            shapefile_root = Path(shapefile_root)

        self._shapefile_root = shapefile_root
        self._year = year
        self._auto_fetch = auto_fetch

    @property
    def shapefile_root(self) -> Path:
        """The path at which shapefiles are cached locally."""
        return self._shapefile_root

    def _read_shapefile(
        self, base_name: str, base_url: str, crs, timeout: int
    ) -> "gpd.GeoDataFrame":
        """Read a shapefile, fetching it from the remote server if necessary."""
        self._auto_fetch_file(base_name, base_url, timeout=timeout)

        path = self._shapefile_full_path(base_name)

        gdf = gpd.read_file(path)
        if crs is not None:
            gdf.to_crs(crs, inplace=True)
        return gdf

    def _shapefile_full_path(self, basename: str) -> Path:
        """Construct the full path to a cached shapefile."""
        path = self._shapefile_root / basename / f"{basename}.shp"
        return path

    def _2008_2009_tiger(self, prefix, shapefile_scope: str, suffix) -> Tuple[str, str]:
        """Construct the URL and name for a 2008 or 2009 vintage TIGER file."""
        # Sometimes we have to go down into a named
        # state directory.
        if suffix in ["cosub", "tract", "bg"]:
            state = shapefile_scope
            state_name = NAMES_FROM_IDS[state].upper().replace(" ", "_")

            base_url = f"https://www2.census.gov/geo/tiger/TIGER{self._year}/{state}_{state_name}"
            name = f"{prefix}_{self._year}_{state}_{suffix}00"
        else:
            # TODO - what to do here?
            base_url, name = self._through_2010_tiger(prefix, shapefile_scope, suffix)

        return base_url, name

    def _through_2010_tiger(self, prefix, shapefile_scope: str, suffix):
        """Construct the URL and name for a 2010-or-earlier vintage TIGER file."""
        # Curiously, the server side puts the 2000 files under
        # the TIGER2010 directory early in the path and early
        # in the file name.
        path_year = self._year
        path_year = max(path_year, 2010)

        base_url = f"https://www2.census.gov/geo/tiger/TIGER{path_year}/{suffix.upper()}/{self._year}"
        name = f"{prefix}_{path_year}_{shapefile_scope}_{suffix}{str(self._year)[-2:]}"
        return base_url, name

    def _post_2010_tiger(self, prefix, shapefile_scope: str, suffix):
        """Construct the URL and name for a post-2010 vintage TIGER file."""
        # Special case for whatever reason the US Census decided.
        base_url = (
            f"https://www2.census.gov/geo/tiger/TIGER{self._year}/{suffix.upper()}"
        )

        if self._year == 2020 and suffix == "tabblock":
            base_url = f"{base_url}20"
            suffix = f"{suffix}20"

        if self._year in [2020, 2021] and suffix == "puma":
            suffix = f"{suffix}10"

        name = f"{prefix}_{self._year}_{shapefile_scope}_{suffix}"
        return base_url, name

    def tiger_url(self, prefix, shapefile_scope, suffix) -> Tuple[str, str]:
        """Construct a URL for a TIGER file."""
        if self._year == 2010 or self._year < 2008:
            return self._through_2010_tiger(prefix, shapefile_scope, suffix)
        elif 2008 <= self._year <= 2009:
            return self._2008_2009_tiger(prefix, shapefile_scope, suffix)
        else:
            return self._post_2010_tiger(prefix, shapefile_scope, suffix)

    def _tiger(self, shapefile_scope: str, geography, crs, timeout: int):
        """Load a TIGER shapefile and normalize its column names."""
        prefix, suffix = ("tl", geography)

        base_url, name = self.tiger_url(prefix, shapefile_scope, suffix)

        gdf = self._read_shapefile(name, base_url, crs, timeout=timeout)

        # Pull off the extra two digits of year that get tacked
        # on in some cases.

        def mapper(col: str) -> str:
            if col.endswith(("20", "10", "00")):
                return col[:-2]
            return col

        gdf.rename(mapper, axis="columns", inplace=True)

        if "STATEFP" not in gdf.columns:
            gdf["STATEFP"] = shapefile_scope

        return gdf

    # The summary level to use for each of the resolutions we
    # support.
    # See https://www2.census.gov/geo/tiger/GENZ2010/ReadMe.pdf
    _CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010 = {
        "state": "040",
        "county": "050",
        "cousub": "060",
        "tract": "140",
        "bg": "150",
    }

    def _through_2010_cb(
        self, cartographic_scope: str, geography: str, resolution: str
    ):
        """Construct the URL and name for a 2010-or-earlier cartographic boundary file."""
        if geography not in self._CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010:
            # Bug fix: the original passed %-style lazy-format arguments to the
            # exception constructor, so the message was never interpolated.
            raise MapException(
                f"Don't know how to interpret geography '{geography}' for pre-2010 maps."
            )

        summary_level = self._CB_SUMMARY_LEVEL_BY_GEOGRAPHY_THROUGH_2010[geography]

        name = f"gz_{self._year}_{cartographic_scope}_{summary_level}_00_{resolution}"
        base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}"

        return base_url, name

    def _post_2010_cb(self, cartographic_scope: str, geography, resolution: str):
        """Construct the URL and name for a post-2010 cartographic boundary file."""
        # May need to revise when 2020 PUMA is published.
        if geography == "puma" and 2010 <= self._year < 2022:
            geography = "puma10"
        elif geography == "puma" and self._year >= 2022:
            geography = "puma20"

        name = f"cb_{self._year}_{cartographic_scope}_{geography}_{resolution}"

        # The shp subdirectory did not appear until 2014.
        if self._year >= 2014:
            base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}/shp"
        else:
            base_url = f"https://www2.census.gov/geo/tiger/GENZ{self._year}"

        return base_url, name

    def _cartographic_bound(
        self, shapefile_scope, geography, resolution, crs, *, timeout: int
    ) -> "gpd.GeoDataFrame":
        """Load a cartographic boundary file and normalize its column names."""
        if self._year <= 2010:
            base_url, name = self._through_2010_cb(
                shapefile_scope, geography, resolution
            )
        else:
            base_url, name = self._post_2010_cb(shapefile_scope, geography, resolution)

        gdf = self._read_shapefile(name, base_url, crs, timeout=timeout)

        # Some files on the server, like
        # https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_050_00_500k.zip
        # leave the 'FP' suffix of column names.
        gdf.rename(
            {
                "STATE": "STATEFP",
                "COUNTY": "COUNTYFP",
                "TRACT": "TRACTCE",
                "BLKGRP": "BLKGRPCE",
            },
            axis="columns",
            inplace=True,
        )

        return gdf

    def read_shapefile(
        self, shapefile_scope: str, geography: str, crs=None, *, timeout: int = 30
    ):
        """
        Read the geometries of geographies.

        This method reads maps suitable
        for use with geometric joins and queries of various types. If you are
        only interested in plotting maps, the
        :py:meth:`~ShapeReader.read_cb_shapefile` method may be more suitable.

        The files are read from the US Census servers and cached locally.
        They are in most cases the same files you can download manually from
        https://www.census.gov/cgi-bin/geo/shapefiles/index.php.

        Individual files the API may download follow a naming convention
        that has evolved a bit over time. So for example a 2010 block group file
        for New Jersey would be found at
        https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_34_bg10.zip
        whereas a similar file for 2020 would be at
        https://www2.census.gov/geo/tiger/TIGER2020/BG/tl_2020_34_bg.zip.

        This method knows many of the subtle changes that have occurred over the years,
        so you should mostly not have to worry about them. It is unlikely it knows
        them all, so please submit an
        issue at https://github.com/vengroff/censusdis/issues if you find
        otherwise.

        Once read, the files are cached locally so that when we reuse the same
        files we do not have to go back to the server.

        Parameters
        ----------
        shapefile_scope
            The geography that is covered by the entire shapefile. In some
            cases, this is a state, e.g. `NJ`. For cases where files
            are available for the entire country, the string `"us"` is typically
            used. In some rare cases, like for the Alaska Native Regional Corporations
            (``"anrc"``) geography, other strings like ``"02"`` are used.
            See the download links at
            https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            if you need to debug issues with a given geography.
        geography
            The geography we want to download bounds for. Supported
            geometries are `"state'`, `"county"`, `"cousub"` (county subdivision),
            `"tract"`, and `"bg"` (block group). Other geometries as defined
            by the US Census may work, but have not been thoroughly tested.
        crs
            The crs to make the file to. If `None`, use the default
            crs of the shapefile. Setting this is useful if we plan
            to merge the resulting `GeoDataFrame` with another so we
            can make sure they use the same crs.
        timeout
            Time out limit (in seconds) for the remote call.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the requested
        geometries.
        """
        return self._tiger(shapefile_scope, geography, crs, timeout=timeout)

    def read_cb_shapefile(
        self,
        shapefile_scope: str,
        geography: str,
        resolution: str = "500k",
        crs=None,
        *,
        timeout: int = 30,
    ) -> "gpd.GeoDataFrame":
        """
        Read the cartographic boundaries of a given geography.

        These are smaller
        files suited for plotting maps, as compared to those returned by
        :py:meth:`~ShapeReader.read_shapefile`, which returns higher
        resolution geometries.

        The files are read from the US Census servers and cached locally.
        They are in most cases the same files you can download manually from
        https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        or similar URLs for other years.

        Individual files the API may download follow a naming convention
        that has evolved a bit over time. So for example a 2010 census tract
        cartographic bounds file
        for New Jersey at 500,000:1 resolution would be found at
        https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_34_140_00_500k.zip
        whereas a similar file for 2020 would be at
        https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_34_tract_500k.zip

        This method knows many of the subtle changes that have occurred over the years,
        so you should mostly not have to worry about them. It is unlikely it knows
        them all, so please submit an
        issue at https://github.com/vengroff/censusdis/issues if you find
        otherwise.

        Once read, the files are cached locally so that when we reuse the same
        files we do not have to go back to the server.

        Parameters
        ----------
        shapefile_scope
            The geography that is covered by the entire shapefile. In some
            cases, this is a state, e.g. `NJ`. For cases where files
            are available for the entire country, the string `"us"` is typically
            used. In some rare cases, like for the Alaska Native Regional Corporations
            (``"anrc"``) geography, other strings like ``"02"`` are used.
            See the download links at
            https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            if you need to debug issues with a given geography.
        geography
            The geography we want to download bounds for. Supported
            geometries are `"state'`, `"county"`, `"cousub"` (county subdivision),
            `"tract"`, and `"bg"` (block group)
        resolution
            What resolution shapes should we use. Permitted options are
            `"500k"`, `"5m"`, and `"20m"` for 1:500,000, 1:5,000,000, and
            1:20,000,000 resolution respectively. Availability varies, but for
            most geographies `"500k"` is available even if others are not.
        crs
            The crs to make the file to. If `None`, use the default
            crs of the shapefile. Setting this is useful if we plan
            to merge the resulting `GeoDataFrame` with another so we
            can make sure they use the same crs.
        timeout
            Time out limit (in seconds) for the remote call.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the boundaries of the requested
        geometries.
        """
        return self._cartographic_bound(
            shapefile_scope, geography, resolution, crs, timeout=timeout
        )

    def try_cb_tiger_shapefile(
        self,
        shapefile_scope: str,
        geography: str,
        resolution: str = "500k",
        crs=None,
        *,
        timeout: int = 60,
    ) -> "gpd.GeoDataFrame":
        """
        Try to retrieve CB file.

        Wraps read_cb_shapefile and read_shapefile.

        If unable to find, attempts to find the full tiger file.

        Returns
        -------
        A `gpd.GeoDataFrame` containing the boundaries of the requested
        geometries.
        """
        try:
            gdf = self._cartographic_bound(
                shapefile_scope, geography, resolution, crs, timeout=timeout
            )
        except MapException as e:
            # Bug fix: the exception was previously passed as a spurious
            # lazy-format argument with no matching placeholder. Use
            # exc_info so the exception details are actually logged.
            logger.debug("Exception loading cb file. Trying tiger instead.", exc_info=e)
            gdf = self._tiger(shapefile_scope, geography, crs, timeout=timeout)

        return gdf

    def _auto_fetch_file(self, name: str, base_url: str, *, timeout: int):
        """Fetch a remote file, but only if auto-fetching is enabled."""
        if not self._auto_fetch:
            return

        self._fetch_file(name, base_url, timeout=timeout)

    def _url_for_file(self, name: str) -> str:
        """Guess the remote URL for a shapefile based on its name alone."""
        if name.startswith("cb_"):
            return f"https://www2.census.gov/geo/tiger/GENZ{self._year}/shp/{name}.zip"

        if name.startswith("tl_"):
            suffix = name.split("_")[-1]

            if self._year <= 2010:
                return (
                    f"https://www2.census.gov/geo/tiger/TIGER{self._year}/"
                    f"{suffix.upper()[:-2]}/{self._year}/{name}.zip"
                )

            return (
                f"https://www2.census.gov/geo/tiger/TIGER{self._year}/"
                f"{suffix.upper()}/{name}.zip"
            )

        # This will not work, but it's the main download page where we
        # can start to look for what we want.
        return "https://www.census.gov/cgi-bin/geo/shapefiles/index.php"

    def _fetch_file(
        self,
        name: str,
        base_url: str,
        *,
        timeout: int,
    ) -> None:
        """
        Download and unzip a remote shapefile into the local cache.

        Raises
        ------
        MapException
            If the remote file does not exist, is not a zip file, or is
            a corrupt zip file.
        """
        dir_path = self._shapefile_root / name

        if dir_path.is_dir():
            # Does it have the .shp file? If not maybe something
            # random went wrong in the previous attempt, or someone
            # deleted some stuff by mistake. So delete it and
            # reload.
            shp_path = dir_path / f"{name}.shp"
            if shp_path.is_file():
                # Looks like the shapefile is there.
                return

            # No shapefile so remove the whole directory and
            # hope for the best when we recreate it.
            shutil.rmtree(dir_path)

        # Make the directory
        dir_path.mkdir()

        # We will put the zip file in the dir we just created.
        zip_path = dir_path / f"{name}.zip"

        # Construct the URL to get the zip file.
        zip_url = f"{base_url}/{name}.zip"

        # Fetch the zip file and write it.
        response = requests.get(
            zip_url,
            timeout=timeout,
            cert=certificates.map_cert,
            verify=certificates.map_verify,
        )

        if response.status_code == 404:
            raise MapException(
                f"{zip_url} was not found. "
                "The Census Bureau may not publish the shapefile you are looking for for the given year. "
                "Or the file you are looking for may be from a year where a naming convention that censusdis "
                "does not recognize was used."
            )

        headers = response.headers
        content_type = headers.get("Content-Type", None)

        if content_type != "application/zip":
            # Bug fix: the message previously had mismatched quotes around
            # 'application/zip'.
            raise MapException(
                f"Expected content type 'application/zip' from {zip_url}, but got '{content_type}' instead."
            )

        with zip_path.open("wb") as file:
            file.write(response.content)

        # Unzip the file and extract all contents.
        try:
            with ZipFile(zip_path) as zip_file:
                zip_file.extractall(dir_path)
        except BadZipFile as exc:
            raise MapException(f"Bad zip file retrieved from {zip_url}") from exc
        finally:
            # We don't need the zipfile anymore.
            zip_path.unlink()

507 

508 

def clip_to_states(gdf, gdf_bounds):
    """
    Clip every geometry in a gdf to the state it belongs to, from the states in the state bounds.

    We clip to state bounds so that we don't plot areas
    outside the state. Typically, this clips areas that
    extend out into the water in coastal areas so we don't
    get strange artifacts in the water in plots.

    The way we tell what state an input geometry belongs to
    is by looking at the `STATEFP` column for that geometry's
    row in the input.

    Parameters
    ----------
    gdf
        The input geometries.
    gdf_bounds
        The state bounds.

    Returns
    -------
    The input geometries where each is clipped to the bounds
    of the state to which it belongs.
    """

    def _clip_group_to_state(group):
        # group.name is the STATEFP value shared by every row in the group.
        state_bounds = gdf_bounds[gdf_bounds.STATEFP == group.name]
        return gpd.clip(group, state_bounds)

    clipped = gdf.groupby(gdf.STATEFP).apply(_clip_group_to_state)
    return clipped.droplevel("STATEFP")

539 

540 

def _wrap_poly(poly: Union[Polygon, Point]):
    """
    Move a polygon.

    Used in shifting AK and HI geometries. A geometry whose first
    x coordinate is east of the antimeridian is translated 360°
    west so it plots next to the rest of the state.
    """
    if isinstance(poly, Polygon):
        exterior_x, _ = poly.exterior.coords.xy
        first_x = exterior_x[0]
    elif isinstance(poly, Point):
        first_x = poly.x
    else:
        # Not sure how to parse it, so leave it
        # where it is.
        logger.warning(f"Unrecognized type {type(poly)} can't be wrapped.")
        return poly

    if first_x > 0:
        return shapely.affinity.translate(poly, xoff=-360.0, yoff=0.0)

    return poly

560 

561 

def _wrap_polys(polys):
    """
    Move the polygons in a multi-polygon.

    Used in shifting AK and HI geometries. Each component polygon is
    wrapped independently by `_wrap_poly`.
    """
    if isinstance(polys, MultiPolygon):
        return MultiPolygon([_wrap_poly(part) for part in polys.geoms])

    # Just in case it's not a MultiPolygon
    return _wrap_poly(polys)

573 

574 

# Boxes that contain AK and HI after _wrap_polys has
# been applied to it. We use this to identify
# geometries that we want to relocate in relocate_ak_hi
# when we don't have a STATEFP or STATE column to help
# identify what is in AK or HI.
# Coordinates are degrees of longitude (x) and latitude (y),
# so these boxes only make sense for a lon/lat CRS like EPSG:4269.

# Alaska bounding box. The western edge is at -188° because
# _wrap_polys shifts Aleutian islands east of the antimeridian
# to longitudes less than -180°.
_AK_MIN_X = -188.0
_AK_MIN_Y = 51.0
_AK_MAX_X = -129.0
_AK_MAX_Y = 72.0

# Rectangle covering AK, traced counter-clockwise and closed
# back at the starting corner.
_AK_BOUNDS = Polygon(
    (
        (_AK_MIN_X, _AK_MIN_Y),
        (_AK_MAX_X, _AK_MIN_Y),
        (_AK_MAX_X, _AK_MAX_Y),
        (_AK_MIN_X, _AK_MAX_Y),
        (_AK_MIN_X, _AK_MIN_Y),
    )
)

# Hawaii bounding box.
_HI_MIN_X = -179.0
_HI_MIN_Y = 18.0
_HI_MAX_X = -154.0
_HI_MAX_Y = 29.0

# Rectangle covering HI.
_HI_BOUNDS = Polygon(
    (
        (_HI_MIN_X, _HI_MIN_Y),
        (_HI_MAX_X, _HI_MIN_Y),
        (_HI_MAX_X, _HI_MAX_Y),
        (_HI_MIN_X, _HI_MAX_Y),
        (_HI_MIN_X, _HI_MIN_Y),
    )
)

# Puerto Rico bounding box.
_PR_MIN_X = -68.0
_PR_MIN_Y = 17.0
_PR_MAX_X = -65.0
_PR_MAX_Y = 19.0

# Rectangle covering PR.
_PR_BOUNDS = Polygon(
    (
        (_PR_MIN_X, _PR_MIN_Y),
        (_PR_MAX_X, _PR_MIN_Y),
        (_PR_MAX_X, _PR_MAX_Y),
        (_PR_MIN_X, _PR_MAX_Y),
        (_PR_MIN_X, _PR_MIN_Y),
    )
)

625 

626 

def _relocate_ak(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the AK bounding box.

    The geometry is scaled down around Anchorage, then translated
    east and south so it plots below the lower 48 states.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    scale_x, scale_y = 0.25, 0.4
    shift_x, shift_y = 33, -34
    anchorage = (-149.9003, 61.2181)

    shrunk = shapely.affinity.scale(geo, xfact=scale_x, yfact=scale_y, origin=anchorage)

    return shapely.affinity.translate(shrunk, xoff=shift_x, yoff=shift_y)

651 

652 

def _relocate_hi(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the HI bounding box.

    The geometry is translated east and north toward the lower 48.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    return shapely.affinity.translate(geo, xoff=50, yoff=6)

672 

673 

def _relocate_pr(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate a geometry that is already known to be in the PR bounding box.

    The geometry is translated west and north toward the lower 48.

    Parameters
    ----------
    geo
        The geometry.

    Returns
    -------
    The relocated geometry.
    """
    return shapely.affinity.translate(geo, xoff=-7, yoff=8)

693 

694 

def _relocate_parts_in_ak_hi_pr(geo: BaseGeometry) -> BaseGeometry:
    """
    Relocate any sub-geometries that happen to fall in the AK or HI or PR bounding boxes.

    If the geometry is a simple polygon, check if it intersects the
    bounding boxes of AK or HI and relocate if so. If it is a
    `MultiPolygon` then recurse in and relocate some
    contained geometries as appropriate. This way it can work on small
    polygons completely contained in the bounding box, or on larger
    multi-polygons like regions that may have some polygons in the bounding
    box and others outside it.

    Parameters
    ----------
    geo
        The geography.

    Returns
    -------
    The geography, possibly with some parts relocated.
    """
    if isinstance(geo, MultiPolygon):
        # Recurse so each component polygon is considered on its own.
        return MultiPolygon([_relocate_parts_in_ak_hi_pr(part) for part in geo.geoms])

    # It is an individual polygon. So see if it is
    # in a box that should be relocated.
    if geo.intersects(_AK_BOUNDS):
        return _relocate_ak(geo)
    if geo.intersects(_HI_BOUNDS):
        return _relocate_hi(geo)
    if geo.intersects(_PR_BOUNDS):
        return _relocate_pr(geo)

    return geo

731 

732 

def _wrap_and_relocate_geos(geo: BaseGeometry):
    """Wrap antimeridian-crossing polygons, then relocate any AK/HI/PR parts."""
    wrapped = _wrap_polys(geo)
    return _relocate_parts_in_ak_hi_pr(wrapped)

736 

737 

def _relocate_ak_hi_pr_group(group):
    """
    Relocate a group of geometries.

    They are relocated if they belong to AK, HI or PR, otherwise
    they are left alone. `group.name` is the state identifier shared
    by all rows in the group.
    """
    simple_movers = {HI: _relocate_hi, PR: _relocate_pr}

    if group.name == AK:
        # Deal with the Aleutian islands wrapping at -180/180 longitude,
        # then relocate.
        group.geometry = group.geometry.apply(_wrap_polys).apply(_relocate_ak)
    elif group.name in simple_movers:
        group.geometry = group.geometry.apply(simple_movers[group.name])

    return group

756 

757 

def relocate_ak_hi_pr(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Relocate any geometry that is in Alaska or Hawaii for plotting purposes.

    We first try an optimization. If there is a `STATEFP`
    column then we relocate rows where that column has a value of
    `AK`, `HI` or `PR`. If there is not a
    `STATEFP` column we check for a `STATE` column and do the same. If neither
    column exists then we dig down into the geometries themselves
    and relocate those that intersect bounding rectangles of the
    two states.

    Note: the expectation is that the crs or the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        the geo data frame to relocate.

    Returns
    -------
    a geo data frame with any geometry in AK or HI moved for plotting.
    """
    # Prefer STATEFP, then STATE, as the column identifying the state
    # of each geometry.
    state_columns = [col for col in ("STATEFP", "STATE") if col in gdf.columns]

    if state_columns:
        # There is a column identifying the state of each geometry
        # so use that to decide what to relocate.
        group_column = state_columns[0]

        grouped = gdf.groupby(gdf[group_column], group_keys=True, sort=False)

        return (
            grouped.apply(
                _relocate_ak_hi_pr_group,
                include_groups=False,
            )
            .reset_index(level=1, drop=True)
            .reset_index(drop=False)
        )

    # There is no column indicating the state of each geometry. This
    # is often because the geometries span states. So we can't easily
    # relocate the two states, but we at least wrap the Aleutian
    # islands if present and then relocate any geometries that are
    # in the bounding boxes of AK and HI.
    gdf = gdf.copy()
    gdf.geometry = gdf.geometry.map(_wrap_and_relocate_geos)

    return gdf

811 

812 

__gdf_crs_bounds: Optional[gpd.GeoDataFrame] = None
"""The bounds of the all CRSs we might use in `plot_map`."""


def _gdf_crs_bounds() -> gpd.GeoDataFrame:
    """
    Construct a dataframe with the bounds of all the CRSs we might use in `plot_map`.

    The dataframe is loaded from a geojson resource packaged with
    censusdis the first time this is called, then cached in a
    module-level singleton.

    Returns
    -------
    The dataframe. It is a singleton you should not modify.
    """
    global __gdf_crs_bounds

    if __gdf_crs_bounds is None:
        resource_package = f"{__package__}.resources"
        with importlib.resources.path(resource_package, "crs_bounds.geojson") as path:
            __gdf_crs_bounds = gpd.GeoDataFrame.from_file(path)

    return __gdf_crs_bounds

834 

835 

def _closest_epsg(
    gdf: gpd.GeoDataFrame,
) -> int:
    """
    Find the EPSG to use by choosing the one closest to the center of the bounds of the gdf.

    We do this by looking at haversine distance between the
    centers. I am sure we could do this more efficiently
    than a linear scan using a data structure optimized for
    nearest neighbor queries on the surface of a sphere.
    But we only have about 100 candidates, so the linear scan is
    simple and not terrible.

    Parameters
    ----------
    gdf
        The gdf we want to plot.

    Returns
    -------
    The EPSG number.
    """
    crs_bounds = _gdf_crs_bounds()

    # Compare in the CRS of the candidate bounds.
    if gdf.crs != crs_bounds.crs:
        gdf = gdf.copy(deep=True).to_crs(crs_bounds.crs)

    min_x, min_y, max_x, max_y = gdf.total_bounds

    # Wrap the end of the Aleutian island chain.
    if min_x > 0:
        min_x = min_x - 360.0
    if max_x > 0:
        max_x = max_x - 360.0

    lon = (min_x + max_x) / 2
    lat = (min_y + max_y) / 2

    centers = crs_bounds.representative_point()

    # Linear scan for the candidate CRS whose center is nearest.
    distances = centers.map(lambda center: haversine((center.y, center.x), (lat, lon)))

    return crs_bounds["epsg"].iloc[distances.argmin()]

883 

884 

def plot_map(
    gdf: gpd.GeoDataFrame,
    *args,
    with_background: bool = False,
    epsg: Optional[int] = None,
    geo_label: Optional[Union[str, pd.Series]] = None,
    geo_label_text_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> plt.Axes:
    """
    Plot a map, optionally with a background.

    Parameters
    ----------
    gdf
        The geo data frame to plot
    args
        Optional args to matplotlib
    with_background
        Should we put in a background map from Open Street maps?
    epsg
        The EPSG to project to. Otherwise a suitable one for the
        geometry will be inferred.
    geo_label
        Name of a column in `gdf` that has labels for each geometry.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`
    kwargs
        keyword args to pass on to matplotlib

    Returns
    -------
    The ax of the resulting plot.
    """
    # Pick a projection if the caller did not specify one.
    projection = _closest_epsg(gdf) if epsg is None else epsg

    gdf = gdf.to_crs(epsg=projection)

    ax = gdf.plot(*args, **kwargs)

    if geo_label is not None:
        _add_plot_map_geo_labels(ax, gdf, geo_label, geo_label_text_kwargs)

    # Suppress axis ticks and tick labels; lat/lon values just add clutter.
    ax.tick_params(
        left=False,
        right=False,
        bottom=False,
        labelleft=False,
        labelbottom=False,
    )

    if with_background:
        cx.add_basemap(
            ax,
            crs=gdf.crs.to_string(),
            source=cx.providers.OpenStreetMap.Mapnik,
        )

    return ax

943 

944 

def _add_plot_map_geo_labels(
    ax: plt.Axes,
    gdf: gpd.GeoDataFrame,
    geo_label: Union[str, pd.Series],
    geo_label_text_kwargs: Optional[Dict[str, Any]],
) -> plt.Axes:
    """
    Add labels to each geometry in a plot.

    Internal utility function behind `plot_map` and `plot_us`.

    Parameters
    ----------
    ax
        The ax we already plotted on.
    gdf
        The geographic data frame we are plotting.
    geo_label
        The column in `gdf` that contains the labels to add to the plot.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "333". "size": 9}`

    Returns
    -------
    `ax`
    """
    # Caller-supplied kwargs override these defaults.
    default_text_kwargs = dict(
        ha="center",
        va="center",
        color="#333",
        size=9,
        path_effects=[pe.withStroke(linewidth=4, foreground="white")],
    )

    text_kwargs = default_text_kwargs | (geo_label_text_kwargs or {})

    label_is_column_name = isinstance(geo_label, str)

    for idx, row in gdf.iterrows():
        # Label at a point guaranteed to be inside the geometry.
        label_point = row["geometry"].representative_point()

        if label_is_column_name:
            label = row[geo_label]
        else:
            label = geo_label.loc[idx]

        ax.text(label_point.x, label_point.y, label, **text_kwargs)

    return ax

995 

996 

def plot_us(
    gdf: gpd.GeoDataFrame,
    *args,
    do_relocate_ak_hi_pr: bool = True,
    with_background: bool = False,
    epsg: int = 9311,
    geo_label: Optional[Union[str, pd.Series]] = None,
    geo_label_text_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> plt.Axes:
    """
    Plot a map of the US with AK and HI relocated.

    This function will move and scale AK and
    HI so that they are plotted at the lower left of the other 48 states,
    just below CA, AZ, and NM.

    It also moves the Aleutian islands that are west of -180° longitude
    so that they are plotted next to the rest of AK. Otherwise, they
    tend to be plotted at longitudes just less than +180°, which
    creates visual discontinuities.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        The geometries to be plotted.
    do_relocate_ak_hi_pr
        If `True` try to relocate AK, HI, and PR. Otherwise, still wrap
        the Aleutian islands west of -180° longitude if present.
    args
        Args to pass to the plot.
    with_background
        Should we put in a background map from Open Street maps?
    epsg:
        The EPSG CRS to project to before plotting. Default is 9311, which
        is equal area. See https://epsg.io/9311.
    geo_label
        Name of a column in `gdf` that has labels for each geometry.
    geo_label_text_kwargs
        kwargs to pass for label text, e.g. `{"color": "#333", "size": 9}`
    kwargs
        Keyword args to pass to the plot.

    Returns
    -------
    ax of the plot.
    """
    if gdf.crs != 4269:
        logger.warning(
            "Expected map to have crs epsg:4269, but got %s instead.", gdf.crs
        )

    if do_relocate_ak_hi_pr:
        gdf = relocate_ak_hi_pr(gdf)
    else:
        # At least wrap the Aleutian islands. Copy first so we do not
        # mutate the caller's data frame (consistent with plot_us_boundary).
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_polys)

    gdf = gdf.to_crs(epsg=epsg)

    ax = gdf.plot(*args, **kwargs)

    if geo_label is not None:
        _add_plot_map_geo_labels(ax, gdf, geo_label, geo_label_text_kwargs)

    if with_background:
        provider = cx.providers.OpenStreetMap.Mapnik

        cx.add_basemap(ax, crs=gdf.crs.to_string(), source=provider)

    # Suppress tick marks and labels; projected coordinates are not
    # meaningful to most viewers.
    ax.tick_params(
        left=False,
        right=False,
        bottom=False,
        labelleft=False,
        labelbottom=False,
    )

    return ax

1081 

1082 

def plot_us_boundary(
    gdf: gpd.GeoDataFrame,
    *args,
    do_relocate_ak_hi_pr: bool = True,
    with_background: bool = False,
    epsg: int = 9311,
    **kwargs,
):
    """
    Plot a map of boundaries the US with AK and HI relocated.

    This function is very much like :py:func:`~plot_us` except
    that it plots only the boundaries of geometries.

    Note: the expectation is that the crs of the incoming geo data frame
    is EPSG:4269 or something that closely approximates it, in units of
    degrees of latitude and longitude. If this is not the case, results
    are unpredictable.

    Parameters
    ----------
    gdf
        The geometries to be plotted.
    args
        Args to pass to the plot.
    do_relocate_ak_hi_pr
        If `True` try to relocate AK, HI, and PR. Otherwise, still wrap
        the Aleutian islands west of -180° longitude if present.
    with_background
        Should we put in a background map from Open Street maps?
    epsg:
        The EPSG CRS to project to before plotting. Default is 9311, which
        is equal area. See https://epsg.io/9311.
    kwargs
        Keyword args to pass to the plot.

    Returns
    -------
    ax of the plot.
    """
    if gdf.crs != 4269:
        # %s, not %d: gdf.crs is a CRS object, not an integer.
        logger.warning(
            "Expected map to have crs epsg:4269, but got %s instead.", gdf.crs
        )

    if do_relocate_ak_hi_pr and with_background:
        # Relocated geometries no longer sit at their true coordinates,
        # so a real-world basemap underneath them would be misleading.
        logger.warning(
            "do_relocate_ak_hi_pr and with_background should not be used together. "
            "Undesired results are likely."
        )

    if do_relocate_ak_hi_pr:
        gdf = relocate_ak_hi_pr(gdf)
    else:
        # At least wrap the Aleutian islands. Copy first so we do not
        # mutate the caller's data frame.
        gdf = gdf.copy()
        gdf.geometry = gdf.geometry.map(_wrap_polys)

    gdf = gdf.to_crs(epsg=epsg)

    ax = gdf.boundary.plot(*args, **kwargs)

    if with_background:
        provider = cx.providers.OpenStreetMap.Mapnik

        cx.add_basemap(ax, crs=gdf.crs.to_string(), source=provider)

    return ax

1151 

1152 

def geographic_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoSeries:
    """
    Compute the centroid of a geography.

    We do this by projecting to epsg 3857 (https://epsg.io/3857),
    computing the centroid, and then projecting back. This gives
    a reasonable answer for most geometries and avoids warnings from
    `GeoPandas`.

    Parameters
    ----------
    gdf
        A geo data frame in any crs.

    Returns
    -------
    A geo data series of the centroids of all the geometries in
    `gdf`.
    """
    original_crs = gdf.crs

    # Centroids are computed in a projected CRS (GeoPandas warns when
    # centroids are taken in a geographic CRS), then mapped back to the
    # CRS the caller gave us.
    return gdf.geometry.to_crs(epsg=3857).centroid.to_crs(original_crs)

1178 

1179 

def sjoin_mostly_contains(
    gdf_large_geos: gpd.GeoDataFrame,
    gdf_small_geos: gpd.GeoDataFrame,
    large_suffix: str = "large",
    small_suffix: str = "small",
    area_threshold: float = 0.8,
    area_epsg: int = 3857,
) -> gpd.GeoDataFrame:
    """
    Spatial join based on fraction of contained area.

    This function is designed to implement the common case
    where we have a number of small geo areas like census
    tracts or block groups in a large area like a CBSA. The
    reason to use this instead of `gpd.GeoDataFrame.sjoin`
    directly is that the smaller geos may not all be
    strictly contained in the bounds of the larger geos.
    And small geos outside the bounds of the larger one
    may intersect along the boundary. So instead, this method
    looks for small geos whose area is at least 80%
    (or another chosen number) within the larger area,

    Parameters
    ----------
    gdf_large_geos
        A geo data frame of one or more large geo areas like CBSAs.
    gdf_small_geos
        A geo data frame of smaller areas like census tracts.
    large_suffix
        Suffix to add to column names from the large side when
        the same name appears in both.
    small_suffix
        Suffix to add to column names from the small side when
        the same name appears in both.
    area_threshold
        The fraction of each smaller area that must be covered by
        one of the large areas to be joined with it.
    area_epsg
        The CRS to use project to before doing area calculations.
        Defaults to 3857. (https://epsg.io/3857),

    Returns
    -------
    Geo data frame of the spatially joined results.

    Raises
    ------
    ValueError
        If the two input frames are not in the same crs.
    """
    if gdf_large_geos.crs != gdf_small_geos.crs:
        raise ValueError(
            "Can only join geometries of the same crs. "
            f"Got {gdf_large_geos.crs} and {gdf_small_geos.crs}"
        )

    large_geo_col = f"_original_large_geos_{area_epsg}"
    small_geo_col = f"_original_small_geos_{area_epsg}"

    # Work on copies so the temporary columns we add below never
    # leak into the caller's data frames.
    gdf_large_geos = gdf_large_geos.copy()
    gdf_small_geos = gdf_small_geos.copy()

    # Keep the original geos around projected to `area_epsg` so
    # we can check intersection areas after the join.
    gdf_large_geos[large_geo_col] = gdf_large_geos.geometry.to_crs(epsg=area_epsg)
    gdf_small_geos[small_geo_col] = gdf_small_geos.geometry.to_crs(epsg=area_epsg)

    # Do an intersection join.
    gdf_intersection = gdf_small_geos.sjoin(
        gdf_large_geos,
        how="inner",
        predicate="intersects",
        lsuffix=small_suffix,
        rsuffix=large_suffix,
    )

    # Filter down to only those where the area of the intersection
    # exceeds the threshold fraction of the small geo's area.
    gdf_results = gdf_intersection[
        gdf_intersection[small_geo_col]
        .intersection(gdf_intersection[large_geo_col])
        .area
        >= area_threshold * gdf_intersection[small_geo_col].area
    ]

    # Drop the temporary projected-geometry columns from the result.
    gdf_results = gdf_results.drop(
        [small_geo_col, large_geo_col],
        axis="columns",
    ).copy()

    # Move the geometry column to the end, by convention.
    gdf_results = gdf_results[
        [col for col in gdf_results.columns if col != "geometry"] + ["geometry"]
    ]

    return gdf_results