Coverage for censusdis/cli/yamlspec.py: 91%

306 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

1# Copyright (c) 2023 Darren Erik Vengroff 

2"""Classes that are loaded from YAML config files for the CLI.""" 

3from abc import ABC 

4import itertools 

5from importlib import import_module 

6from pathlib import Path 

7from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, ClassVar 

8 

9import geopandas as gpd 

10import pandas as pd 

11import yaml 

12 

13from matplotlib.ticker import StrMethodFormatter 

14 

15import censusdis.data as ced 

16import censusdis.maps as cem 

17import censusdis.datasets 

18import censusdis.states 

19from censusdis.geography import InSpecType 

20from censusdis.impl.varsource.base import VintageType 

21 

22 

def _class_constructor(clazz: type):
    """
    Build a YAML constructor function that instantiates `clazz`.

    Parameters
    ----------
    clazz
        The class to instantiate. The entries of the YAML mapping node
        are passed to it as keyword arguments.

    Returns
    -------
        A function taking `(loader, node)` that can be registered with
        a loader's `add_constructor`.
    """

    def constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode) -> Any:
        """Construct a new object of the given class."""
        # deep=True asks the loader to construct nested nodes as part of
        # this call, so `clazz` receives complete values.
        kwargs = loader.construct_mapping(node, deep=True)
        return clazz(**kwargs)

    return constructor

32 

33 

class VariableSpec(ABC):
    """
    Abstract base class for specification of variables to download from the U.S. Census API.

    Parameters
    ----------
    denominator
        The denominator to divide by when constructing fractional variables.
        If `False` then no fractional variables are added. If the name of a
        variable, that variable will be downloaded and used as a denominator
        to compute fractional versions of all of the other variables. If `True`
        then the denominator will be computed as the sum of all the other
        variables.
    frac_prefix
        The prefix to prepend to fractional variables. If `None` a default
        prefix of `'frac_'` is used.
    frac_not
        If `True`, `1 - fraction` is reported instead of the fraction.
    """

    def __init__(
        self,
        *,
        denominator: Union[str, bool] = False,
        frac_prefix: Optional[str] = None,
        frac_not: bool = False,
    ):
        self._denominator = denominator
        self._frac_prefix = "frac_" if frac_prefix is None else frac_prefix
        self._frac_not = frac_not

    @property
    def denominator(self) -> Union[str, bool]:
        """The denominator to divide by when constructing fractional variables."""
        return self._denominator

    @property
    def frac_prefix(self) -> str:
        """The prefix to prepend to fractional variables."""
        return self._frac_prefix

    @property
    def frac_not(self) -> bool:
        """Should we return 1 - fraction instead of fraction."""
        return self._frac_not

    def variables_to_download(self) -> List[str]:
        """Return a list of the variables that need to be downloaded from the U.S. Census API."""
        # In the base class the only variable we can need is a named denominator.
        if isinstance(self._denominator, str):
            return [self._denominator]

        return []

    def groups_to_download(self) -> List[Tuple[str, bool]]:
        """
        Return the names of groups of variables that need to be downloaded from the U.S. Census API.

        Returns
        -------
            A list of `(group, leaves_only)` pairs. Empty in the base class.
        """
        return []

    def synthesize(
        self, df_downloaded: Union[pd.DataFrame, gpd.GeoDataFrame]
    ) -> Optional[Union[pd.DataFrame, gpd.GeoDataFrame]]:
        """
        Post-process after downloading to compute synthesized variables such as fractions.

        Parameters
        ----------
        df_downloaded
            A data frame of variables that were downloaded. Any synthesized
            variables are added as new columns.

        Returns
        -------
            The base implementation adds nothing and hands back `df_downloaded`
            itself. Subclasses in this module make their additions in-place,
            and :py:meth:`download` ignores the return value, so callers
            should not rely on it.
        """
        return df_downloaded

    def download(
        self,
        dataset: str,
        vintage: VintageType,
        *,
        set_to_nan: Union[bool, Iterable[int]] = True,
        skip_annotations: bool = True,
        with_geometry: bool = False,
        contained_within: Optional[ced.ContainedWithin] = None,
        remove_water: bool = False,
        api_key: Optional[str] = None,
        row_keys: Optional[Union[str, Iterable[str]]] = None,
        **kwargs: InSpecType,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download the variables we need from the U.S. Census API.

        Most of the optional parameters here mirror those in
        :py:func:`~ced.download`.

        Parameters
        ----------
        dataset
            The dataset to download from. For example `"acs/acs5"`,
            `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
            symbolic names for datasets, like `ACS5` for `"acs/acs5"`
            in :py:module:`censusdis.datasets`.
        vintage
            The vintage to download data for. For most data sets this is
            an integer year, for example, `2020`. But for
            a timeseries data set, pass the string `'timeseries'`.
        set_to_nan
            A list of values that should be set to NaN. Normally these are special
            values that the U.S. Census API sometimes returns. If `True`, then all
            values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
            If `False`, no replacements will be made.
        skip_annotations
            If `True` try to filter out `group` or `leaves_of_group` variables that are
            annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
            for more details. Variable names passed in `download_variables` are not
            affected by this flag.
        with_geometry
            If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
            will have a geometry that is a cartographic boundary suitable for plotting
            a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
            for details of the shapefiles that will be downloaded on your behalf to
            generate these boundaries.
        contained_within
            An optional :py:class:`~ced.ContainedWithin` if we want to download
            geometries contained within others.
        remove_water
            If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
            remove water areas from returned geometry.
        api_key
            An optional API key. If you don't have or don't use a key, the number
            of calls you can make will be limited to 500 per day.
        row_keys
            An optional set of identifier keys to help merge together requests for more than the census API limit of
            50 variables per query. These keys are useful for census datasets such as the Current Population Survey
            where the geographic identifiers do not uniquely identify each row.
        kwargs
            A specification of the geometry that we want data for. For example,
            `state = "*", county = "*"` will download county-level data for
            the entire US.

        Returns
        -------
            A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
        """
        group_list = self.groups_to_download()

        # Split the (group, leaves_only) pairs by flag. An empty list is
        # passed on as `None`.
        groups = [group for group, leaves_only in group_list if not leaves_only] or None
        leaves_of_groups = [
            group for group, leaves_only in group_list if leaves_only
        ] or None

        # Our download might be scoped to be contained
        # within some other geometries.
        download_scope = contained_within if contained_within else ced

        df_or_gdf = download_scope.download(
            dataset=dataset,
            vintage=vintage,
            download_variables=self.variables_to_download(),
            group=groups,
            leaves_of_group=leaves_of_groups,
            set_to_nan=set_to_nan,
            skip_annotations=skip_annotations,
            with_geometry=with_geometry,
            remove_water=remove_water,
            api_key=api_key,
            row_keys=row_keys,
            **kwargs,
        )

        # Synthesized columns are added in-place; the return value is not used.
        self.synthesize(df_or_gdf)

        return df_or_gdf

    @classmethod
    def _yaml_loader(cls):
        """Return a YAML loader class that knows the variable spec tags."""
        # NOTE(review): the constructors are registered on `yaml.SafeLoader`
        # itself rather than on a private subclass.
        loader = yaml.SafeLoader
        loader.add_constructor("!VariableList", _class_constructor(VariableList))
        loader.add_constructor("!Group", _class_constructor(CensusGroup))
        loader.add_constructor("!SpecCollection", _variable_spec_collection_constructor)
        return loader

    @classmethod
    def load_yaml(cls, path: Union[str, Path]):
        """
        Load a YAML file containing a `VariableSpec`.

        Parameters
        ----------
        path
            Path of the YAML file to read.

        Returns
        -------
            Whatever object the YAML document constructs.
        """
        loader = cls._yaml_loader()

        # Use a context manager so the file handle is closed even if
        # parsing raises.
        with open(path, "rb") as yaml_file:
            return yaml.load(yaml_file, Loader=loader)

238 

239 

class VariableList(VariableSpec):
    """
    Specification of a list of variables to download from the U.S. Census API.

    Parameters
    ----------
    variables
        The variables to download. A single name or an iterable of names.
    denominator
        The denominator to divide by when constructing fractional variables.
        If `False` then no fractional variables are added. If the name of a
        variable, that variable will be downloaded and used as a denominator
        to compute fractional versions of all of the other variables. If `True`
        then the denominator will be computed as the sum of all the other
        variables.
    frac_prefix
        The prefix to prepend to fractional variables. If `None` a default
        prefix of `'frac_'` is used.
    frac_not
        If `True`, `1 - fraction` is stored instead of the fraction.
    """

    def __init__(
        self,
        variables: Union[str, Iterable[str]],
        *,
        denominator: Union[str, bool] = False,
        frac_prefix: Optional[str] = None,
        frac_not: Optional[bool] = False,
    ):
        super().__init__(
            denominator=denominator, frac_prefix=frac_prefix, frac_not=frac_not
        )
        # A bare string is one variable name, not an iterable of characters.
        if isinstance(variables, str):
            self._variables = [variables]
        else:
            self._variables = list(variables)

    def variables_to_download(self) -> List[str]:
        """
        Return a list of the variables that need to be downloaded from the U.S. Census API.

        This consists of the variables passed at construction time, and a denominator
        variable if one was specified. A new list is returned on every call so
        callers cannot modify this spec's own list by accident.
        """
        # Copy so the caller never gets a handle on our internal list.
        to_download = list(self._variables)

        if (
            isinstance(self.denominator, str)
            and self.denominator not in self._variables
        ):
            # We specified a specific denominator that was not already
            # one of the variables, so get it.
            to_download.append(self.denominator)

        return to_download

    def synthesize(
        self, df_downloaded: Union[pd.DataFrame, gpd.GeoDataFrame]
    ) -> Optional[Union[pd.DataFrame, gpd.GeoDataFrame]]:
        """
        Post-process after downloading to add fractional variables.

        This is where fractional variables are generated.

        Parameters
        ----------
        df_downloaded
            A data frame of variables that were downloaded. Any synthesized
            variables are added as new columns named
            `f"{frac_prefix}{variable}"`.

        Returns
        -------
            `None` when fractional variables are generated; additions are made
            in-place in `df_downloaded`. When no denominator is configured
            the frame is handed back untouched, which is kept for backward
            compatibility. Callers should not rely on the return value.
        """
        if not self.denominator:
            return df_downloaded

        if isinstance(self.denominator, str):
            # A named variable is the denominator.
            denominator = df_downloaded[self.denominator]
        else:
            # Any other truthy value: use the row-wise sum of our variables.
            denominator = df_downloaded[self._variables].sum(axis="columns")

        for variable in self._variables:
            frac = df_downloaded[variable] / denominator
            df_downloaded[f"{self.frac_prefix}{variable}"] = (
                1.0 - frac if self.frac_not else frac
            )

    def __eq__(self, other) -> bool:
        """
        Are two `VariableList`'s equal.

        Two lists are equal when they hold the same variables, in any order,
        and have the same denominator. `frac_prefix` and `frac_not` are
        not compared.
        """
        if not isinstance(other, VariableList):
            return False

        return (
            sorted(self._variables) == sorted(other._variables)
            and self.denominator == other.denominator
        )

338 

339 

class CensusGroup(VariableSpec):
    """
    Specification of a group of variables to download from the U.S. Census API.

    Parameters
    ----------
    group
        The name of a census group, such as `B03002`, or a list of several
        such groups.
    leaves_only
        If `True`, then only download the variables that are at the leaves of
        the group, not the internal variables.
    denominator
        The denominator to divide by when constructing fractional variables.
        If `False` or `None` then no fractional variables are added. If the
        name of a variable, that variable will be downloaded and used as a
        denominator to compute fractional versions of all of the other
        variables. If `True` then the denominator will be computed as the sum
        of all the other variables.
    frac_prefix
        The prefix to prepend to fractional variables. If `None` a default
        prefix of `'frac_'` is used.
    frac_not
        If `True`, `1 - fraction` is stored instead of the fraction.
    """

    def __init__(
        self,
        group: Union[str, Iterable[str]],
        *,
        leaves_only: bool = False,
        denominator: Optional[str] = None,
        frac_prefix: Optional[str] = None,
        frac_not: bool = False,
    ):
        super().__init__(
            denominator=False if denominator is None else denominator,
            frac_prefix=frac_prefix,
            frac_not=frac_not,
        )
        # A bare string is a single group name.
        if isinstance(group, str):
            self._group = [group]
        else:
            self._group = list(group)
        self._leaves_only = leaves_only

    def groups_to_download(self) -> List[Tuple[str, bool]]:
        """
        Return the names of groups of variables that need to be downloaded from the U.S. Census API.

        The returned values are simply the groups specified at construction
        time, each paired with the `leaves_only` flag.

        Returns
        -------
            The names of groups to download.
        """
        pairs = []
        for name in self._group:
            pairs.append((name, self._leaves_only))
        return pairs

    def synthesize(self, df_downloaded: Union[pd.DataFrame, gpd.GeoDataFrame]):
        """
        Post-process after downloading to add fractional variables.

        This is where fractional variables are generated. A column belongs
        to a group when its name starts with the group name.

        Parameters
        ----------
        df_downloaded
            A data frame of variables that were downloaded. Any synthesized
            variables are added as new columns.

        Returns
        -------
            None. Any additions are made in-place in `df_downloaded`.
        """
        wanted = self.denominator
        named = isinstance(wanted, str)

        if not named and not wanted:
            # No denominator configured, so nothing to synthesize.
            return

        for prefix in self._group:
            # Snapshot the columns before adding any, so the new fractional
            # columns are not themselves visited.
            columns = df_downloaded.columns

            if named:
                group_total = None
            else:
                # Denominator is the row-wise sum of the group's own columns.
                in_group = [name for name in columns if name.startswith(prefix)]
                group_total = df_downloaded[in_group].sum(axis="columns")

            for name in columns:
                if not name.startswith(prefix):
                    continue

                divisor = df_downloaded[wanted] if named else group_total
                share = df_downloaded[name] / divisor
                target = f"{self.frac_prefix}{name}"

                if self.frac_not:
                    df_downloaded[target] = 1.0 - share
                else:
                    df_downloaded[target] = share

    def __eq__(self, other) -> bool:
        """
        Are two `CensusGroup`'s equal.

        Groups are compared without regard to order, along with the
        denominator and the `leaves_only` flag.
        """
        if not isinstance(other, CensusGroup):
            return False

        if sorted(self._group) != sorted(other._group):
            return False

        if self.denominator != other.denominator:
            return False

        return self._leaves_only == other._leaves_only

446 

447 

class VariableSpecCollection(VariableSpec):
    """
    Specification built on top of a collection of other :py:class:`~VariableSpec`s.

    When downloading, all the groups and all the variables
    specified in any of the constituent specs will be
    downloaded.

    Parameters
    ----------
    variable_specs
        A collection of other :py:class:`~VariableSpec`s.
    """

    def __init__(self, variable_specs: Iterable[VariableSpec]):
        super().__init__(denominator=None)
        self._variable_specs = list(variable_specs)

    def variables_to_download(self) -> List[str]:
        """
        Return a list of the variables that need to be downloaded from the U.S. Census API.

        Returns all the variables to be downloaded by the :py:class:`~VariableSpec`'s
        in the collection, without duplicates and in first-seen order.
        """
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(
            dict.fromkeys(
                itertools.chain.from_iterable(
                    spec.variables_to_download() for spec in self._variable_specs
                )
            )
        )

    def groups_to_download(self) -> List[Tuple[str, bool]]:
        """
        Return the names of groups of variables that need to be downloaded from the U.S. Census API.

        The result is a list of the unique groups returned by all the :py:class:`~VariableSpec`'s
        given at construction time, in first-seen order.

        Returns
        -------
            The names of groups to download.
        """
        return list(
            dict.fromkeys(
                itertools.chain.from_iterable(
                    spec.groups_to_download() for spec in self._variable_specs
                )
            )
        )

    def synthesize(self, df_downloaded: Union[pd.DataFrame, gpd.GeoDataFrame]):
        """
        Post-process after downloading to add synthesized variables.

        We do this by calling `synthesize` on each of our constituent variable specifications.

        Parameters
        ----------
        df_downloaded
            A data frame of variables that were downloaded. Any synthesized
            variables are added as new columns.

        Returns
        -------
            None. Any additions are made in-place in `df_downloaded`.
        """
        for spec in self._variable_specs:
            spec.synthesize(df_downloaded)

    def __eq__(self, other) -> bool:
        """
        Are two `VariableSpecCollection`s equal.

        They are equal when they have the same number of specs and every
        spec in one can be paired with a distinct equal spec in the other,
        regardless of order.
        """
        if not isinstance(other, VariableSpecCollection):
            return False

        if len(self._variable_specs) != len(other._variable_specs):
            return False

        # Indices into other._variable_specs that have already been paired,
        # so no spec in `other` is matched twice.
        matched = set()

        # Does every spec in self have a unique match in other?
        for self_spec in self._variable_specs:
            for ii, other_spec in enumerate(other._variable_specs):
                if ii not in matched and self_spec == other_spec:
                    matched.add(ii)
                    break
            else:
                # The inner loop finished without finding a partner.
                return False

        return True

544 

545 

def _variable_spec_collection_constructor(
    loader: yaml.SafeLoader, node: yaml.nodes.SequenceNode
) -> VariableSpecCollection:
    """
    Construct a variable spec collection.

    The items of the YAML sequence node become the constituent specs of
    the returned :py:class:`~VariableSpecCollection`.
    """
    return VariableSpecCollection(loader.construct_sequence(node, deep=True))

552 

553 

class DataSpec:
    """
    A specification for what data we want from the U.S. Census API.

    In order to download data we must know the data set and vintage
    and have one or more :py:class:`~VariableSpec`s that tell us
    what variables we need and what synthetic variables to create,
    for example fractional variables.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"`
        in :py:module:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`.
    specs
        A single :py:class:`~VariableSpec` or an iterable of them. An
        iterable is wrapped in a :py:class:`~VariableSpecCollection`.
    geography
        A specification of the geography, for example `{'state': '*'}`
        for all states or `{'state': censusdis.states.NJ, 'county': '*'}`
        for all counties in New Jersey.
    contained_within
        An optional specification for the geometry the results should be
        contained within. For example, we could select a CBSA here and
        put wildcards for state and county in `geography` to get all counties
        contained within the CBSA. We need this in cases like this because
        CBSAs are off-spine while states and counties are on-spine.
    area_threshold
        How much of the area of a geometry must be contained in an outer
        geometry for it to be included.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    remove_water
        If `True` and if with_geometry=True, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    """

    def __init__(
        self,
        dataset: str,
        vintage: VintageType,
        specs: Union[VariableSpec, Iterable[VariableSpec]],
        geography: Dict[str, Union[str, List[str]]],
        *,
        contained_within: Optional[Dict[str, Union[str, List[str]]]] = None,
        area_threshold: float = 0.8,
        with_geometry: bool = False,
        remove_water: bool = False,
    ):
        # Map symbolic names or use what we are given if there is no mapping.
        self._dataset = getattr(censusdis.datasets, dataset, dataset)
        self._vintage = vintage
        # If it is a raw list construct a collection around it.
        self._variable_spec = (
            specs if isinstance(specs, VariableSpec) else VariableSpecCollection(specs)
        )
        self._geography = self.map_state_and_county_names(geography)

        if contained_within is None:
            self._contained_within = None
        else:
            contained_within = self.map_state_and_county_names(contained_within)
            self._contained_within = ced.ContainedWithin(
                area_threshold, **contained_within
            )

        self._with_geometry = with_geometry
        self._remove_water = remove_water

    @classmethod
    def map_state_and_county_names(
        cls, geography: Dict[str, Union[str, List[str]]]
    ) -> Dict[str, Union[str, List[str]]]:
        """
        If there is a state, and optionally counties, in a geography, try to map them.

        Symbolic names are looked up as attributes of `censusdis.states` and
        of the state's `censusdis.counties.<state>` module. Values with no
        such attribute are kept as given.

        Parameters
        ----------
        geography
            The geography specification. It is not modified.

        Returns
        -------
            `geography` itself if it has no `'state'` key, otherwise a
            copy with the state and county values mapped.
        """

        def map_state(state: str) -> str:
            """Map the name if a symbolic name exists."""
            return getattr(censusdis.states, state, state)

        def _map_county(state: str):
            """Construct a function to map counties in a state."""
            # The county module is named after the state's name from
            # NAMES_FROM_IDS, lower-cased with underscores for spaces.
            state_symbol = (
                censusdis.states.NAMES_FROM_IDS[state].lower().replace(" ", "_")
            )

            state_county_module = import_module(f"censusdis.counties.{state_symbol}")

            def map_county(county: str):
                """Map a county in the given state."""
                return getattr(state_county_module, county, county)

            return map_county

        # If there is no 'state' in geography there is nothing to do.
        # If there is a 'state', we copy the dict and do the mapping.
        if "state" in geography:
            geography = dict(geography)

            # We might need to map the symbol.
            if isinstance(geography["state"], str):
                geography["state"] = map_state(geography["state"])

            # Re-check the type because the mapped value is whatever the
            # attribute lookup produced.
            if isinstance(geography["state"], str):
                # There is a single state, so there might be counties
                # underneath it that need mapping.
                if "county" in geography and geography["state"] != "*":
                    map_county = _map_county(geography["state"])
                    if isinstance(geography["county"], str):
                        geography["county"] = map_county(geography["county"])
                    else:
                        geography["county"] = [
                            map_county(county) for county in geography["county"]
                        ]
            else:
                geography["state"] = [map_state(state) for state in geography["state"]]

        return geography

    @property
    def dataset(self) -> str:
        """What data set to query."""
        return self._dataset

    @property
    def vintage(self) -> VintageType:
        """What vintage."""
        return self._vintage

    @property
    def with_geometry(self) -> bool:
        """Do we want to download geometry as well as data so we can plot maps."""
        return self._with_geometry

    @property
    def remove_water(self) -> bool:
        """Should we improve the geometry by masking off water."""
        return self._remove_water

    @property
    def variable_spec(self) -> VariableSpec:
        """The specification of variables to download."""
        return self._variable_spec

    @property
    def geography(self) -> Dict[str, Union[str, List[str]]]:
        """What geography to download data for."""
        return self._geography

    @property
    def contained_within(self) -> Union[None, ced.ContainedWithin]:
        """What geometry are we contained within."""
        return self._contained_within

    def download(
        self,
        api_key: Optional[str] = None,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Download the data we want from the U.S. Census API.

        Parameters
        ----------
        api_key
            An optional API key. If you don't have or don't use a key, the number
            of calls you can make will be limited to 500 per day.

        Returns
        -------
            A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
        """
        return self._variable_spec.download(
            dataset=self.dataset,
            vintage=self._vintage,
            with_geometry=self._with_geometry,
            contained_within=self._contained_within,
            remove_water=self._remove_water,
            api_key=api_key,
            **self._geography,
        )

    @classmethod
    def _yaml_loader(cls):
        """Return the variable spec loader with the `!DataSpec` tag added."""
        loader = VariableSpec._yaml_loader()
        loader.add_constructor("!DataSpec", _class_constructor(cls))
        return loader

    @classmethod
    def load_yaml(cls, path: Union[str, Path]):
        """
        Load a YAML file containing a `DataSpec`.

        Parameters
        ----------
        path
            Path of the YAML file to read.

        Returns
        -------
            Whatever object the YAML document constructs.
        """
        loader = cls._yaml_loader()

        # Use a context manager so the file handle is closed even if
        # parsing raises.
        with open(path, "rb") as yaml_file:
            return yaml.load(yaml_file, Loader=loader)

755 

756 

class PlotSpec:
    """
    A specification for how to plot data we downloaded.

    Parameters
    ----------
    variable
        What variable to plot. Specify this to shade geographies
        based on the value of the variable. Leave out and set `boundary=True`
        to plot boundaries instead.
    boundary
        Should we plot boundaries instead of filled geographies?
        If `True`, `variable` should not be specified.
    title
        A title for the plot.
    with_background
        If `True`, plot over a background map.
    legend
        If `True` and plotting a variable (not a boundary) then add a legend.
    legend_format
        How to format the numbers on the legend. The options are
        `"float"`, `"int"`, `"dollar"`, `"percent"`, or a format string like `"${x:.2f}"`
        to choose any Python string format you want.
    projection
        What projection to use. `"US"` means move AK, HI, and PR. Anything
        else is interpreted as an EPSG. If `None` is passed, `"US"` is used.
    plot_kwargs
        Additional keyword args for matplotlib to use in plotting.

    Raises
    ------
    ValueError
        If neither or both of `variable` and `boundary=True` are given.
    """

    def __init__(
        self,
        *,
        variable: Optional[str] = None,
        boundary: bool = False,
        title: Optional[str] = None,
        with_background: bool = False,
        plot_kwargs: Optional[Dict[str, Any]] = None,
        projection: Optional[str] = None,
        legend: bool = True,
        legend_format: Optional[str] = None,
    ):
        if variable is None and not boundary:
            raise ValueError("Must specify either `variable=` or `boundary=True`")
        if variable is not None and boundary:
            raise ValueError("Must specify only one of `variable=` or `boundary=True`")

        if projection is None:
            projection = "US"

        self._variable = variable
        self._boundary = boundary
        self._title = title
        self._legend = legend
        self._legend_format = legend_format
        self._with_background = with_background
        # Use a fresh dict per instance when none is supplied.
        self._plot_kwargs = {} if plot_kwargs is None else plot_kwargs
        self._projection = projection

    @property
    def variable(self) -> Union[str, None]:
        """What variable will we plot."""
        return self._variable

    @property
    def boundary(self) -> bool:
        """Should we plot boundaries instead of a variable."""
        return self._boundary

    @property
    def with_background(self) -> bool:
        """Should we plot a background map from Open Street Maps."""
        return self._with_background

    @property
    def plot_kwargs(self) -> Dict[str, Any]:
        """
        Additional keyword args to control the plot.

        e.g. `{'figsize': [12, 8]}` to change the default size of the plot.
        """
        return self._plot_kwargs

    @property
    def title(self):
        """The plot title."""
        return self._title

    @property
    def legend(self):
        """Is there a legend."""
        return self._legend

    @property
    def legend_format(self):
        """Format for the legend numbers."""
        return self._legend_format

    @property
    def projection(self):
        """What projection to use when plotting."""
        return self._projection

    def __eq__(self, other) -> bool:
        """Are two `PlotSpec`'s equal."""
        if not isinstance(other, PlotSpec):
            return False

        return (
            self._variable == other._variable
            and self._boundary == other._boundary
            and self._with_background == other._with_background
            and self._projection == other._projection
            and self._title == other._title
            and self._legend == other._legend
            and self._legend_format == other._legend_format
            and self._plot_kwargs == other._plot_kwargs
        )

    # Named legend formats: (format string, factor the plotted values are
    # multiplied by before formatting).
    _LEGEND_FORMATS: Dict[str, Tuple[str, float]] = {
        "dollar": ("${x:,.0f}", 1.0),
        "int": ("{x:,.0f}", 1.0),
        "float": ("{x:,}", 1.0),
        "percent": ("{x:.0f}%", 100),
    }

    def _final_legend_format(self):
        """
        Resolve `legend_format` to a `(format string, scale)` pair.

        A name found in `_LEGEND_FORMATS` gives its entry. Anything else is
        used as the format string itself with a scale of 1.0.
        """
        return self._LEGEND_FORMATS.get(self._legend_format, (self._legend_format, 1.0))

    def plot(self, gdf: gpd.GeoDataFrame, ax=None):
        """
        Plot data on a map according to the specification.

        Parameters
        ----------
        gdf
            The data to plot.
        ax
            Optional existing ax to plot on top of.

        Returns
        -------
            `ax` of the plot.
        """
        final_legend_format, legend_scale = self._final_legend_format()

        # A formatter is only needed when a legend is drawn for a variable
        # and a format was requested.
        legend_kwds = (
            None
            if self._boundary or not self._legend or self._legend_format is None
            else {"format": StrMethodFormatter(final_legend_format)}
        )

        if self._projection in ["US", "us", "U.S."]:
            if self._boundary:
                ax = cem.plot_us_boundary(
                    gdf,
                    self._variable,
                    with_background=self._with_background,
                    do_relocate_ak_hi_pr=True,
                    ax=ax,
                    **self._plot_kwargs,
                )
            else:
                # NOTE(review): this adds the `_scaled_var` column to the
                # frame the caller passed in.
                gdf["_scaled_var"] = gdf[self._variable] * legend_scale
                ax = cem.plot_us(
                    gdf,
                    "_scaled_var",
                    with_background=self._with_background,
                    do_relocate_ak_hi_pr=True,
                    legend=self._legend,
                    legend_kwds=legend_kwds,
                    ax=ax,
                    **self._plot_kwargs,
                )
        else:
            # Any other projection is treated as an EPSG code.
            gdf = gdf.to_crs(epsg=self._projection)

            if self._boundary:
                gdf = gdf.boundary
            else:
                gdf["_scaled_var"] = gdf[self._variable] * legend_scale

            ax = cem.plot_map(
                gdf,
                self._variable if self._boundary else "_scaled_var",
                with_background=self._with_background,
                legend=self._legend and not self._boundary,
                legend_kwds=legend_kwds,
                ax=ax,
                **self.plot_kwargs,
            )

        if self._title is not None:
            ax.set_title(self._title)

        return ax

    @classmethod
    def _yaml_loader(cls):
        """Return a YAML loader class that knows the `!PlotSpec` tag."""
        # NOTE(review): the constructor is registered on `yaml.SafeLoader`
        # itself rather than on a private subclass.
        loader = yaml.SafeLoader
        loader.add_constructor("!PlotSpec", _class_constructor(cls))
        return loader

    @classmethod
    def load_yaml(cls, path: Union[str, Path]) -> "PlotSpec":
        """
        Load a YAML file containing a `PlotSpec`.

        Parameters
        ----------
        path
            Path of the YAML file to read.

        Returns
        -------
            Whatever object the YAML document constructs.
        """
        loader = cls._yaml_loader()

        # Use a context manager so the file handle is closed even if
        # parsing raises.
        with open(path, "rb") as yaml_file:
            return yaml.load(yaml_file, Loader=loader)