Coverage for censusdis/impl/us_census_shapefiles.py: 97%

119 statements  

# Copyright (c) 2023 Darren Erik Vengroff
"""
Details of how U.S. Census shapefiles and their contents are named.

Also includes utilities for managing downloaded shapefiles and
for water clipping.
"""

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Callable, Tuple, Optional, List, Generator, Union
from logging import getLogger

import geopandas as gpd
import pandas as pd

from censusdis import CensusApiException, maps as cmap
from censusdis.impl.geometry import drop_slivers_from_gdf
from censusdis.impl.varsource.base import VintageType


logger = getLogger(__name__)


_GEO_QUERY_FROM_DATA_QUERY_INNER_GEO: Dict[
    str,
    Callable[[int], Tuple[Optional[str], str, List[str], List[str]]],
] = {
    # innermost geo: ( shapefile_scope, shapefile_geo_name, df_on, gdf_on )
    "us": lambda year: ("us", "nation", ["US"], ["US"]),
    "region": lambda year: ("us", "region", ["REGION"], ["REGIONCE"]),
    "division": lambda year: ("us", "division", ["DIVISION"], ["DIVISIONCE"]),
    "combined statistical area": lambda year: (
        "us",
        "csa",
        ["COMBINED_STATISTICAL_AREA"],
        ["CSAFP"],
    ),
    "metropolitan statistical area/micropolitan statistical area": lambda year: (
        "us",
        "cbsa",
        ["METROPOLITAN_STATISTICAL_AREA_MICROPOLITAN_STATISTICAL_AREA"],
        ["CBSAFP"],
    ),
    "state": lambda year: ("us", "state", ["STATE"], ["STATEFP"]),
    "consolidated city": lambda year: (
        "us",
        "concity",
        ["STATE", "CONSOLIDATED_CITY"],
        ["STATEFP", "CONCTYFP"],
    ),
    "county": lambda year: (
        "us",
        "county",
        ["STATE", "COUNTY"],
        ["STATEFP", "COUNTYFP"],
    ),
    "public use microdata area": lambda year: (
        "us",
        "puma10" if year < 2020 else "puma20",
        ["STATE", "PUBLIC_USE_MICRODATA_AREA"],
        ["STATEFP", "PUMACE"] if year < 2020 else ["STATEFP20", "PUMACE20"],
    ),
    "congressional district": lambda year: (
        "us",
        _congressional_district_from_year(year),
        ["STATE", "CONGRESSIONAL_DISTRICT"],
        ["STATEFP", f"{_congressional_district_from_year(year).upper()}FP"],
    ),
    "zip code tabulation area": lambda year: (
        "us",
        "zcta520" if year >= 2020 else "zcta510",
        ["ZIP_CODE_TABULATION_AREA"],
        ["ZCTA5CE10" if year < 2020 else "ZCTA5CE20" if year == 2020 else "ZCTA5CE"],
    ),
    "american indian area/alaska native area/hawaiian home land": lambda year: (
        "us",
        "aiannh",
        ["AMERICAN_INDIAN_AREA_ALASKA_NATIVE_AREA_HAWAIIAN_HOME_LAND"],
        ["AIANNHCE"],
    ),
    "new england city and town area": lambda year: (
        "us",
        "necta",
        ["NEW_ENGLAND_CITY_AND_TOWN_AREA"],
        ["NECTAFP"],
    ),
    # For these, the shapefiles are at the state level, so `None`
    # indicates that we have to fill it in based on the geometry
    # being queried.
    "county subdivision": lambda year: (
        None,
        "cousub",
        ["STATE", "COUNTY_SUBDIVISION"],
        ["STATEFP", "COUSUB" if year == 2010 else "COUSUBFP"],
    ),
    "place": lambda year: (None, "place", ["STATE", "PLACE"], ["STATEFP", "PLACEFP"]),
    "tract": lambda year: (
        None,
        "tract",
        ["STATE", "COUNTY", "TRACT"],
        ["STATEFP", "COUNTYFP", "TRACTCE"],
    ),
    "block group": lambda year: (
        None,
        "bg",
        ["STATE", "COUNTY", "TRACT", "BLOCK_GROUP"],
        ["STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE"],
    ),
    "block": lambda year: (
        None,
        "tabblock",
        ["STATE", "COUNTY", "TRACT", "BLOCK"],
        ["STATEFP", "COUNTYFP", "TRACTCE", "BLOCKCE"],
    ),
    "school district (unified)": lambda year: (
        None,
        "unsd",
        ["STATE", "SCHOOL_DISTRICT_UNIFIED"],
        ["STATEFP", "UNSDLEA"],
    ),

122 "school district (elementary)": lambda year: ( 

123 # CB files from 2016 on exist with the "elsd" name and "ELSDLEA" column. 

124 # Earlier tiger files use the name "sda" and "ESDLEA" column. 

125 None, 

126 "elsd" if year >= 2016 else "sde", 

127 ["STATE", "SCHOOL_DISTRICT_ELEMENTARY"], 

128 ["STATEFP", "ELSDLEA" if year >= 2016 else "ESDLEA"], 

129 ), 

130 "school district (secondary)": lambda year: ( 

131 # Similar scenario to school district (secondary). 

132 None, 

133 "scsd" if year >= 2016 else "sde", 

134 ["STATE", "SCHOOL_DISTRICT_SECONDARY"], 

135 ["STATEFP", "SCSDLEA" if year >= 2016 else "SSDLEA"], 

136 ), 

137 "state legislative district (upper chamber)": lambda year: ( 

138 None, 

139 "sldu", 

140 ["STATE", "STATE_LEGISLATIVE_DISTRICT_UPPER_CHAMBER"], 

141 ["STATEFP", "SLDUST"], 

142 ), 

143 "state legislative district (lower chamber)": lambda year: ( 

144 None, 

145 "sldl", 

146 ["STATE", "STATE_LEGISLATIVE_DISTRICT_LOWER_CHAMBER"], 

147 ["STATEFP", "SLDLST"], 

148 ), 

149 "voting district": lambda year: ( 

150 None, 

151 "vtd", 

152 ["STATE", "COUNTY", "VOTING_DISTRICT"], 

153 ( 

154 ["STATEFP20", "COUNTYFP20", "VTDST20"] 

155 if year >= 2020 

156 else ["STATEFP10", "COUNTYFP10", "VTDST10"] 

157 ), 

158 ), 

    # This one could be a little dangerous if subminor civil
    # divisions exist in states and are not mapped as subbarrios.
    # It appears the only example of this is the ESTATE in the
    # USVI. So we are going to punt for the moment and deal with
    # it at some future time, probably by adding an arg to the
    # lambdas to take in the bound params so we can look at them.
    #
    # Noted in https://github.com/vengroff/censusdis/issues/223.
    "subminor civil division": lambda year: (
        None,
        "subbarrio",
        ["STATE", "COUNTY", "COUNTY_SUBDIVISION", "SUBMINOR_CIVIL_DIVISION"],
        ["STATEFP", "COUNTYFP", "COUSUBFP", "SUBMCDFP"],
    ),
    "alaska native regional corporation": lambda year: (
        None,
        "anrc",
        ["ALASKA_NATIVE_REGIONAL_CORPORATION"],
        ["ANRCFP"],
    ),
}

180""" 

181Helper map for the _with_geometry case. 

182 

183A map from the innermost level of a geometry specification 

184to the arguments we need to pass to `get_cb_shapefile` 

185to get the right shapefile for the geography and the columns 

186we need to join the data and shapefile on. 

187 

188The values are functions of the year. Once we locate one, we 

189call it with the year as an argument to get the final value. 

190This is necessary for e.g. congressional districts, which have 

191names that change every two years. It also turns out to be very 

192helpful in dealing with little quirks in the data like changing 

193the name of the COUSUB/COSUBFP column in just one year. 

194 

195We should add everything in 

196https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html 

197to this map. 

198""" 


def geo_query_from_data_query_inner_geo(
    year: int, geo_level: str
) -> Tuple[Optional[str], str, List[str], List[str]]:
    """
    Look up `geo_level` in the map and call its value for the given year.

    Parameters
    ----------
    year
        The year to pass to callable values.
    geo_level
        The key to look up.

    Returns
    -------
    A tuple of `(shapefile_scope, shapefile_geo_name, df_on, gdf_on)`.
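
    Examples
    --------
    An illustrative lookup (the values below can be read off the
    `"county"` entry in the map above):

    >>> geo_query_from_data_query_inner_geo(2020, "county")
    ('us', 'county', ['STATE', 'COUNTY'], ['STATEFP', 'COUNTYFP'])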

217 """ 

218 if geo_level not in _GEO_QUERY_FROM_DATA_QUERY_INNER_GEO: 

219 raise CensusApiException( 

220 "The with_geometry=True flag is only allowed if the " 

221 f"geometry for the data to be loaded ('{geo_level}') is one of " 

222 f"{list(_GEO_QUERY_FROM_DATA_QUERY_INNER_GEO.keys())}." 

223 ) 

224 

225 return _GEO_QUERY_FROM_DATA_QUERY_INNER_GEO[geo_level](year) 

226 

227 

228def _geo_query_from_data_query_inner_geo_items( 

229 year: int, 

230) -> Generator[Tuple[Optional[str], str, List[str], List[str]], None, None]: 

231 """Generate the items in `GEO_QUERY_FROM_DATA_QUERY_INNER_GEO`.""" 

232 for geo_level in _GEO_QUERY_FROM_DATA_QUERY_INNER_GEO: 

233 yield geo_level, geo_query_from_data_query_inner_geo(year, geo_level) 

234 

235 

236def _congressional_district_from_year(year: int) -> str: 

237 """Construct the short form of the congressional district used in a given year.""" 

238 # See the files in https://www2.census.gov/geo/tiger/GENZ2020/shp/ 

239 # and similar. The interesting ones are of the form 

240 # 

241 # cb_20YY_us_cdCCC_500k.zip 

242 # 

243 # where YY is the year and CCC is the congressional district 

244 # used. 

245 # 

246 # The mappings are not exactly at two year intervals as we would expect. 
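    # As a worked check of the rule below: 2018 -> 104 + (2018 - 1994) // 2
    # = 104 + 12 -> "cd116", and 2022 -> 104 + 14 -> "cd118". The 2020 and
    # 2021 vintages are the exception handled first.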

    if year == 2020 or year == 2021:
        # For some reason they did not update to cd117 for these years. Pandemic?
        return "cd116"

    # Regular year rule.
    congress = 104 + (year - 1994) // 2
    return f"cd{congress}"


def infer_geo_level(year: Optional[int], df_data: pd.DataFrame) -> str:
    """
    Infer the geography level based on column names.

    Parameters
    ----------
    year
        The vintage of the data. `None` to infer from the data.
    df_data
        A dataframe of variables with one or more columns that
        can be used to infer what geometry level the rows represent.

        For example, if the column `"STATE"` exists, we could infer that
        the data is on a state-by-state basis. But if there are
        columns for both `"STATE"` and `"COUNTY"`, the data is probably
        at the county level.

        If, on the other hand, there is a `"COUNTY"` column but not a
        `"STATE"` column, then there is some ambiguity. The data
        probably corresponds to counties, but the same county ID can
        exist in multiple states, so we will raise a
        :py:class:`~CensusApiException` with an error message explaining
        the situation.

        If there is no match, we will also raise an exception. Again, we
        do this, rather than, for example, returning `None`, so that we
        can provide an informative error message about the likely cause
        and what to do about it.

        This function is not often called directly, but rather from
        :py:func:`~add_inferred_geography`, which infers the geography
        level and then adds a `geometry` column containing the appropriate
        geography for each row.

    Returns
    -------
    The name of the geography level.
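
    Examples
    --------
    An illustrative sketch (`df` is a hypothetical dataframe with state
    and county identifier columns and an arbitrary variable value):

    >>> df = pd.DataFrame(
    ...     {"STATE": ["34"], "COUNTY": ["013"], "B01001_001E": [100]}
    ... )
    >>> infer_geo_level(2020, df)
    'county'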

293 """ 

294 match_key = None 

295 match_on_len = 0 

296 partial_match_keys = [] 

297 

298 for k, (_, _, df_on, _) in _geo_query_from_data_query_inner_geo_items(year): 

299 if all(col in df_data.columns for col in df_on): 

300 # Full match. We want the longest full match 

301 # we find. 

302 if match_key is None or len(df_on) > match_on_len: 

303 match_key = k 

304 elif df_on[-1] in df_data.columns: 

305 # Partial match. This could result in us 

306 # not getting what we expect. Like if we 

307 # have STATE and TRACT, but not COUNTY, we will 

308 # get a partial match on [STATE, COUNTY. TRACT] 

309 # and a full match on [STATE]. We probably did 

310 # not mean to match [STATE], we just lost the 

311 # COUNTY column somewhere along the way. 

312 partial_match_keys.append(k) 

313 

314 if match_key is None: 

315 raise CensusApiException( 

316 f"Unable to infer geometry. Was not able to locate any of the " 

317 "known sets of columns " 

318 f"{tuple(df_on for _, (_, _, df_on, _) in _geo_query_from_data_query_inner_geo_items(year))} " 

319 f"in the columns {list(df_data.columns)}." 

320 ) 

321 

322 if partial_match_keys: 

323 raise CensusApiException( 

324 f"Unable to infer geometry. Geometry matched {match_key} on columns " 

325 f"{geo_query_from_data_query_inner_geo(year, match_key)[2]} " 

326 "but also partially matched one or more candidates " 

327 f"{tuple(geo_query_from_data_query_inner_geo(year, k)[2] for k in partial_match_keys)}. " 

328 "Partial matches are usually unintended. Either add columns to allow a " 

329 "full match or rename some columns to prevent the undesired partial match." 

330 ) 

331 

332 return match_key 

333 

334 


def add_geography(
    df_data: pd.DataFrame,
    year: Optional[VintageType],
    shapefile_scope: str,
    geo_level: str,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Add geography to data.

    Parameters
    ----------
    df_data
        The data we downloaded from the census API.
    year
        The year for which to fetch geometries. We need this
        because they change over time. If `None`, look for a
        `'YEAR'` column in `df_data` and possibly add different
        geometries for different years as needed.
    shapefile_scope
        The scope of the shapefile. This is typically either a state
        such as `NJ` or the string `"us"`.
    geo_level
        The geography level we want to add.
    with_geometry_columns
        If `True`, keep all the additional columns that come with the
        shapefiles downloaded to get geometry information.
    tiger_shapefiles_only
        If `True`, only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` when
        `with_geometry_columns` is also `True`, because the additional
        columns in the shapefiles differ between the CB and TIGER files.

    Returns
    -------
    A GeoDataFrame with the original data and an
    added geometry column for each row.
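
    Examples
    --------
    A hedged sketch of typical usage (`df` is a hypothetical dataframe of
    New Jersey tract-level data with `STATE`, `COUNTY`, and `TRACT`
    columns; this downloads shapefiles, so it is skipped here):

    >>> gdf = add_geography(df, 2020, "34", "tract")  # doctest: +SKIP
    >>> gdf.columns[-1]  # doctest: +SKIP
    'geometry'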

379 """ 

380 ( 

381 query_shapefile_scope, 

382 shapefile_geo_level, 

383 df_on, 

384 gdf_on, 

385 ) = geo_query_from_data_query_inner_geo(year, geo_level) 

386 

387 # If the query spec has a hard-coded value then we use it. 

388 if query_shapefile_scope is not None: 

389 shapefile_scope = query_shapefile_scope 

390 

391 def individual_shapefile(sub_scope: str, query_year: int) -> gpd.GeoDataFrame: 

392 """Read the relevant shapefile and add a YEAR column to it.""" 

393 resolution = "5m" if shapefile_geo_level == "nation" else "500k" 

394 

395 try: 

396 if tiger_shapefiles_only: 

397 gdf = __shapefile_reader(query_year).read_shapefile( 

398 sub_scope, shapefile_geo_level 

399 ) 

400 else: 

401 gdf = __shapefile_reader(query_year).try_cb_tiger_shapefile( 

402 sub_scope, shapefile_geo_level, resolution=resolution 

403 ) 

404 gdf["YEAR"] = query_year 

405 return gdf 

406 except cmap.MapException as ex: 

407 # If there are some years where we can't find a shapefile, 

408 # skip over it and those rows will not have geometry in the 

409 # final result. 

410 logger.info( 

411 "Unable to load shapefile for scope %s for year %d", 

412 sub_scope, 

413 query_year, 

414 exc_info=ex, 

415 ) 

416 return gpd.GeoDataFrame() 

417 

418 # If there is a single defined year then we can load the single 

419 # shapefile. If not, then we have to load multiple shapefiles, 

420 # one per year, and concatenate them. 

421 # 

422 # Whether there is a single or multiple years, there could also 

423 # me multiple scopes, e.g. states, for which we have to download 

424 # shapefiles. If so, by the time we get here, they are encoded in 

425 # one string with comma separators. 

426 if isinstance(year, int): 

427 gdf_shapefile = pd.concat( 

428 individual_shapefile(sub_scope, year) 

429 for sub_scope in shapefile_scope.split(",") 

430 ) 

431 merge_gdf_on = gdf_on 

432 else: 

433 gdf_shapefile = pd.concat( 

434 individual_shapefile(sub_scope, unique_year) 

435 for unique_year in df_data["YEAR"].unique() 

436 for sub_scope in shapefile_scope.split(",") 

437 ) 

438 

439 merge_gdf_on = ["YEAR"] + gdf_on 

440 df_on = ["YEAR"] + df_on 

441 

442 if len(gdf_shapefile.index) == 0: 

443 # None of the years matched, so we add None for geometry to all. 

444 gdf = gpd.GeoDataFrame(df_data, copy=True) 

445 gdf.set_geometry([None for _ in gdf.index], inplace=True) 

446 return gdf 

447 

448 if "TRACT" in df_data.columns: 

449 df_data["TRACT"] = df_data["TRACT"].str.ljust( 

450 6, "0" 

451 ) # Pre 2010 data has inconsistencies in TRACT string length 

452 

453 # The nation-level shapefile is missing a column 

454 # to merge on. 

455 if shapefile_geo_level == "nation": 

456 gdf_shapefile["US"] = "1" 

457 

458 final_data_columns = df_data.columns.tolist() 

459 

460 # Strip out extra columns unless told to keep them. 

461 if with_geometry_columns: 

462 final_data_columns = final_data_columns + [ 

463 col 

464 for col in gdf_shapefile.columns 

465 if col not in merge_gdf_on 

466 and col not in final_data_columns 

467 and (col != "geometry") 

468 ] 

469 else: 

470 gdf_shapefile = gdf_shapefile[merge_gdf_on + ["geometry"]] 

471 

472 gdf_data = gdf_shapefile.merge( 

473 # There is a `NAME` column in the shapefile, so we use suffixes 

474 # to avoid confusing it with the `NAME` that might be in df_data. 

475 # Similar for any other conflict in names that are not being merged on. 

476 df_data, 

477 how="right", 

478 left_on=merge_gdf_on, 

479 right_on=df_on, 

480 suffixes=("_shapefile", ""), 

481 ) 

482 

483 # Get the columns we want in a reasonable order matching 

484 # how they are in the data, with geometry at the end. 

485 gdf_data = gdf_data[final_data_columns + ["geometry"]] 

486 

487 # Rearrange columns so geometry is at the end. 

488 gdf_data = gdf_data[ 

489 [col for col in gdf_data.columns if col != "geometry"] + ["geometry"] 

490 ] 

491 

492 return gdf_data 


@dataclass
class _ShapefileRoot:
    """A private class to stash the root we will use to cache shapefiles locally."""

    shapefile_root: Optional[Path] = None


__shapefile_root = _ShapefileRoot()
__shapefile_readers: Dict[int, cmap.ShapeReader] = {}


def set_shapefile_path(shapefile_path: Union[Path, None]) -> None:
    """
    Set the path to the directory to cache shapefiles.

    This is where we will cache shapefiles downloaded when
    `with_geometry=True` is passed to :py:func:`~download`.

    Parameters
    ----------
    shapefile_path
        The path to use for caching shapefiles.
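
    Examples
    --------
    An illustrative call (the directory shown is an arbitrary choice;
    any writable path works):

    >>> from pathlib import Path
    >>> set_shapefile_path(Path.home() / ".censusdis" / "data")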

517 """ 

518 __shapefile_root.shapefile_root = shapefile_path 

519 

520 

521def get_shapefile_path() -> Union[Path, None]: 

522 """ 

523 Get the path to the directory to cache shapefiles. 

524 

525 This is where we will cache shapefiles downloaded when 

526 `with_geometry=True` is passed to :py:func:`~download`. 

527 

528 Returns 

529 ------- 

530 The path to use for caching shapefiles. 

531 """ 

532 return __shapefile_root.shapefile_root 

533 

534 

535def __shapefile_reader(year: int): 

536 reader = __shapefile_readers.get(year, None) 

537 

538 if reader is None: 

539 reader = cmap.ShapeReader( 

540 __shapefile_root.shapefile_root, 

541 year, 

542 ) 

543 

544 __shapefile_readers[year] = reader 

545 

546 return reader 

547 

548 

549def _identify_counties(gdf_geo: gpd.GeoDataFrame, year: int): 

550 """ 

551 Take a geodataframe and identify which US counties the supplied geography overlaps. 

552 

553 Parameters 

554 ---------- 

555 gdf_geo 

556 A GeoDataFrame containing polygons within the United States 

557 year 

558 The year for which to fetch geometries. We need this 

559 because they change over time. 

560 

561 Returns 

562 ------- 

563 A list of five digit county FIPS codes. 
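
    Examples
    --------
    An illustrative sketch (`gdf_geo` here is a hypothetical geography
    overlapping Essex and Hudson counties, NJ; requires shapefile
    downloads, so skipped here):

    >>> _identify_counties(gdf_geo, 2020)  # doctest: +SKIP
    ['34013', '34017']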

564 """ 

565 # Some dataframes will contain the county column already 

566 if "STATE" in gdf_geo and "COUNTY" in gdf_geo: 

567 fips_codes = gdf_geo["STATE"] + gdf_geo["COUNTY"] 

568 

569 return fips_codes.unique().tolist() 

570 # Otherwise, we load all the US counties and perform an overlap operation 

571 else: 

572 reader = __shapefile_readers.get(year) 

573 us_counties = reader.read_cb_shapefile("us", "county") 

574 us_counties["FIPS"] = us_counties["STATEFP"] + us_counties["COUNTYFP"] 

575 

576 county_overlap = us_counties.overlay(gdf_geo, keep_geom_type=False) 

577 fips_codes = county_overlap["STATEFP"] + county_overlap["COUNTYFP"] 

578 

579 return fips_codes.unique().tolist() 

580 

581 

582def clip_water( 

583 gdf_geo: gpd.GeoDataFrame, 

584 year: int, 

585 minimum_area_sq_meters: int = 10000, 

586 sliver_threshold=0.01, 

587): 

588 """ 

589 Remove water from input `GeoDataFrame`. 

590 

591 Parameters 

592 ---------- 

593 gdf_geo 

594 The GeoDataFrame from which we want to remove water 

595 year 

596 The year for which to fetch geometries. We need this 

597 because they change over time. 

598 minimum_area_sq_meters 

599 The minimimum size of a water area to be removed 

600 

601 Returns 

602 ------- 

603 A GeoDataFrame with the water areas larger than 

604 the specified threshold removed. 
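
    Examples
    --------
    A hedged sketch (`gdf_tracts` is a hypothetical GeoDataFrame of
    tracts previously downloaded with geometry; requires shapefile
    downloads, so skipped here):

    >>> gdf_land = clip_water(gdf_tracts, 2020)  # doctest: +SKIP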

605 """ 

606 counties = _identify_counties(gdf_geo, year) 

607 gdf_water = _retrieve_water(counties, year) 

608 

609 gdf_without_water = _water_difference(gdf_geo, gdf_water, minimum_area_sq_meters) 

610 

611 original_crs = gdf_without_water.crs 

612 gdf_without_water = drop_slivers_from_gdf( 

613 gdf_without_water.to_crs(epsg=3857), threshold=sliver_threshold 

614 ).to_crs(original_crs) 

615 

616 return gdf_without_water 

617 

618 

619def _retrieve_water(county_fips_codes: list[str], year: int): 

620 """ 

621 Load `AREAWATER` files from tiger for specified counties. 

622 

623 Parameters 

624 ---------- 

625 county_FIPS_codes 

626 A list of five digit county FIPS codes 

627 

628 Returns 

629 ------- 

630 A GeoDataFrame containing the census defined water in the supplied counties 
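
    Examples
    --------
    An illustrative call (FIPS codes for Essex and Hudson counties, NJ;
    requires shapefile downloads, so skipped here):

    >>> gdf_water = _retrieve_water(["34013", "34017"], 2020)  # doctest: +SKIP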

631 """ 

632 reader = __shapefile_readers.get(year) 

633 

634 gdf_water = pd.concat( 

635 reader.read_shapefile(shapefile_scope=county, geography="areawater") 

636 for county in county_fips_codes 

637 ) 

638 # Geo pandas has no concat method, so we convert from a pandas df 

639 gdf_water = gpd.GeoDataFrame(gdf_water) 

640 return gdf_water 

641 

642 

643def _water_difference( 

644 gdf_geo: gpd.GeoDataFrame, gdf_water: gpd.GeoDataFrame, minimum_area_sq_meters: int 

645): 

646 """ 

647 Remove water polygons exceeding minimum size from supplied `GeoDataFrame`. 

648 

649 Parameters 

650 ---------- 

651 gdf_geo 

652 A GeoDataFrame containing polygons within the United States 

653 gdf_water 

654 A GeoDataFrame containing census AREAWATER polygons 

655 minimum_area_sq_meters 

656 The smallest water polygon to be removed, specified in square meters 

657 

658 Returns 

659 ------- 

660 A version of gdf_geo with the water areas removed 
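
    Examples
    --------
    A sketch with inputs as described above. Note that the default
    threshold of 10,000 square meters is one hectare, so only water
    bodies at least that large are removed:

    >>> gdf_land = _water_difference(gdf_geo, gdf_water, 10_000)  # doctest: +SKIP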

661 """ 

662 # Combining polygons speeds up the overlay operation 

663 geo_combined_water = gdf_water[ 

664 gdf_water["AWATER"] >= minimum_area_sq_meters 

665 ].union_all() 

666 gdf_combined_water = gpd.GeoDataFrame(geometry=[geo_combined_water]) 

667 gdf_combined_water = gdf_combined_water.set_crs(gdf_water.crs) 

668 

669 return gdf_geo.overlay( 

670 gdf_combined_water, 

671 "difference", 

672 keep_geom_type=False, 

673 )