Coverage for censusdis/impl/varcache.py: 87%

242 statements  

coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

# Copyright (c) 2022 Darren Erik Vengroff

"""Variable cache code to cache metadata about variables locally."""

from collections import defaultdict
from logging import getLogger
from typing import (
    Any,
    DefaultDict,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

import numpy as np
import pandas as pd

from censusdis import CensusApiException
from censusdis.impl.varsource.base import VariableSource
from censusdis.impl.varsource.censusapi import CensusApiVariableSource

import censusdis.datasets

import re


logger = getLogger(__name__)


class VariableCache:
    """
    A cache of variables and groups.

    This looks a lot like a :py:class:`~VariableSource` but it
    implements a cache in front of a :py:class:`~VariableSource`.

    Users will rarely if ever need to construct one of these
    themselves. In almost all cases they will use the singleton
    `censusdis.censusdata.variables`.
    """

    def __init__(self, *, variable_source: Optional[VariableSource] = None):
        if variable_source is None:
            variable_source = CensusApiVariableSource()

        self._variable_source = variable_source
        self._variable_cache: DefaultDict[str, DefaultDict[int, Dict[str, Any]]] = (
            defaultdict(lambda: defaultdict(dict))
        )
        self._group_cache: DefaultDict[str, DefaultDict[int, Dict[str, Any]]] = (
            defaultdict(lambda: defaultdict(dict))
        )

        self._all_data_sets_cache: Optional[pd.DataFrame] = None
        self._data_sets_by_year_cache: Dict[int, pd.DataFrame] = {}

    def get(
        self,
        dataset: str,
        year: int,
        name: str,
    ) -> Dict[str, Dict]:
        """
        Get the description of a given variable.

        See :py:meth:`VariableSource.get`
        for details on the data format. We first look in the cache and then, if
        we don't find what we are looking for, we call the source behind us and
        cache the results before returning them.

        Parameters
        ----------
        dataset
            The census dataset.
        year
            The year.
        name
            The name of the variable.

        Returns
        -------
        The details of the variable.
        """
        cached_value = self._variable_cache[dataset][year].get(name, None)

        if cached_value is not None:
            return cached_value

        value = self._variable_source.get(dataset, year, name)

        self._variable_cache[dataset][year][name] = value

        return value
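
    # Illustrative sketch (not part of the original module): how the read-through
    # cache behaves. The dataset and variable names are ACS examples taken from
    # the docstrings in this file.
    #
    #     variables = VariableCache()
    #     variables.get("acs/acs5", 2020, "B03002_003E")  # first call fetches from the API source
    #     variables.get("acs/acs5", 2020, "B03002_003E")  # second call is served from the local cache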

    def get_group(
        self,
        dataset: str,
        year: int,
        name: Optional[str],
        skip_subgroup_variables: bool = True,
    ) -> Dict[str, Dict]:
        """
        Get information on the variables in a group.

        Parameters
        ----------
        dataset
            The census dataset.
        year
            The year.
        name
            The name of the group, or `None` if this data set does not have
            groups.
        skip_subgroup_variables
            If this is `True`, then we will ignore variables from alphabetical
            subgroups. These are relatively common in the ACS, where there are
            groups like `B01001` that have subgroups `B01001A`, `B01001B` and
            so on. The underlying census API sometimes reports variables like
            `B01001A_001E` from these as members of `B01001` and other times as
            members of `B01001A`. Setting this `True`, which is the default,
            means `B01001A_001E` is not reported when the group name is `B01001`.

        Returns
        -------
        A dictionary that maps from the names of each variable in the group
        to a dictionary containing a description of the variable. The
        format of the description is a dictionary as described in
        the documentation for
        :py:meth:`VariableSource.get`.
        """
        group_variable_names = self._group_cache[dataset][year].get(name, None)

        if group_variable_names is None:
            # Missed in the cache, so go fetch it.
            value = self._variable_source.get_group(dataset, year, name)

            # Cache all the variables in the group.
            group_variables = value["variables"]

            for variable_name, variable_details in group_variables.items():
                self._variable_cache[dataset][year][variable_name] = variable_details

            # Cache the names of the variables in the group.
            group_variable_names = list(
                variable_name for variable_name in group_variables
            )
            self._group_cache[dataset][year][name] = group_variable_names

        # Optionally filter out the variables that are in
        # alphabetical subgroups.
        if skip_subgroup_variables and name is not None:
            subgroup_var_pattern = re.compile(f"^{name}[A-Z]_.*$")
            group_variable_names = [
                group_variable_name
                for group_variable_name in group_variable_names
                if not (
                    group_variable_name.startswith(name)
                    and subgroup_var_pattern.match(group_variable_name)
                )
            ]

        # Reformat what we return so it includes the full
        # details on each variable.
        return {
            group_variable_name: self.get(dataset, year, group_variable_name)
            for group_variable_name in group_variable_names
        }
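
    # Illustrative sketch (not part of the original module): fetching a group and
    # the effect of `skip_subgroup_variables`, using the ACS group names from the
    # docstring above as examples.
    #
    #     variables = VariableCache()
    #     group = variables.get_group("acs/acs5", 2020, "B01001")
    #     # With the default skip_subgroup_variables=True, variables such as
    #     # "B01001A_001E" are filtered out even if the API lists them under "B01001".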

    class GroupTreeNode:
        """A node in a tree of variables that make up a group."""

        def __init__(self, name: Optional[str] = None):
            self._name = name

            self._children: Dict[str, "VariableCache.GroupTreeNode"] = {}

        @property
        def name(self):
            """The name of the node."""
            return self._name

        @name.setter
        def name(self, name: Optional[str]):
            self._name = name

        def add_child(self, path_component: str, child: "VariableCache.GroupTreeNode"):
            """
            Add a child to a node.

            Parameters
            ----------
            path_component
                The next component of the path to the variable, beyond the
                path to `self`.
            child
                The node that should be our child at the specified path component.

            Returns
            -------
            None
            """
            self._children[path_component] = child

        def is_leaf(self) -> bool:
            """
            Is the node a leaf.

            Returns
            -------
            `True` if it is a leaf; `False` if it is an internal node.
            """
            return len(self._children) == 0

        def __len__(self):
            """Return the number of children the node has."""
            return len(self._children)

        def __contains__(self, component: str):
            """Determine if a component is in the node."""
            return component in self._children

        def __getitem__(self, component: str):
            """Get a child of a node."""
            return self._children[component]

        def keys(self) -> Generator[str, None, None]:
            """
            Return the keys, which are the strings of the next component to each child.

            Returns
            -------
            The keys.
            """
            for key, _ in self.items():
                yield key

        def values(self) -> Generator["VariableCache.GroupTreeNode", None, None]:
            """
            Return the values, which are our children.

            Returns
            -------
            The values (our children).
            """
            for _, value in self.items():
                yield value

        def items(
            self,
        ) -> Generator[Tuple[str, "VariableCache.GroupTreeNode"], None, None]:
            """
            Return the items.

            The items are (key, value) pairs. See :py:meth:`keys` and
            :py:meth:`values`.

            Returns
            -------
            The items.
            """
            for component, node in self._children.items():
                yield component, node

        def get(
            self, component, default: Optional["VariableCache.GroupTreeNode"] = None
        ):
            """
            Get the child at the given path component below us.

            Parameters
            ----------
            component
                The next component of the path below us.
            default
                The default value to return if there is no node at the path.

            Returns
            -------
            The node below us, or `default` if it is not there.
            """
            return self._children.get(component, default)

        def leaves(self) -> Generator["VariableCache.GroupTreeNode", None, None]:
            """
            Return all the leaves below us.

            Compare with :py:meth:`~leaf_variables`
            which returns just the names of the leaves.

            Returns
            -------
            All the leaves below us.
            """
            if self.is_leaf():
                yield self
            for child in self._children.values():
                yield from child.leaves()

        def leaf_variables(self) -> Generator[str, None, None]:
            """
            Return the names of all the leaves below us.

            Compare with :py:meth:`~leaves`
            which returns the full node for each leaf.

            Returns
            -------
            The names of the leaves below us.
            """
            yield from (leaf.name for leaf in self.leaves())

        @property
        def min_leaf_name(self) -> str:
            """The name of the alphabetically first leaf."""
            return min(self.leaf_variables())

        def _node_str(self, level: int, component: str, indent_prefix: str) -> str:
            line = indent_prefix * level
            if len(self._children) > 0 or self._name is not None:
                line = f"{line}+ {component}"
            if self.name is not None:
                line = f"{line} ({self.name})"

            return line

        def subtree_str(self, level: int, component: str, indent_prefix: str) -> str:
            """
            Return a string representing a subtree.

            Used to construct an indented string representation for the whole tree.
            """
            rep = self._node_str(level, component, indent_prefix)
            for path_component, child in sorted(
                self._children.items(), key=lambda t: t[1].min_leaf_name
            ):
                rep = (
                    rep
                    + "\n"
                    + child.subtree_str(level + 1, path_component, indent_prefix)
                )
            return rep

        def __str__(self) -> str:
            """Return a string representation of the node."""
            return "\n".join(
                child.subtree_str(0, path_component, indent_prefix=" ")
                for path_component, child in sorted(
                    self._children.items(), key=lambda t: t[1].min_leaf_name
                )
            )

        def __repr__(self) -> str:
            """Return the representation of the node."""
            return str(self)
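
    # Illustrative sketch (not part of the original module): once a tree has been
    # built (see `group_tree`, defined later in this class), each node behaves like
    # a small read-only mapping from path components to child nodes.
    #
    #     variables = VariableCache()
    #     tree = variables.group_tree("acs/acs5", 2020, "B03002")
    #     list(tree.keys())              # top-level label components, e.g. ["Estimate"]
    #     sorted(tree.leaf_variables())  # names of the non-aggregate variables
    #     print(tree)                    # indented text rendering of the whole tree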

    def _all_data_sets(self) -> pd.DataFrame:
        """
        Get all the data sets.

        Cache to avoid repeated remote calls.

        Returns
        -------
        A data frame of all the data sets for all years.
        """
        if self._all_data_sets_cache is None:
            datasets = self._variable_source.get_datasets(year=None)

            self._all_data_sets_cache = self._datasets_from_source_dict(datasets)

        return self._all_data_sets_cache

    def _data_sets_for_year(self, year: int) -> pd.DataFrame:
        """
        Get all data sets for a given year.

        Cache to avoid repeated remote calls.

        Parameters
        ----------
        year
            The year to query.

        Returns
        -------
        A data frame of all the data sets for the year.
        """
        if year not in self._data_sets_by_year_cache:
            datasets = self._variable_source.get_datasets(year)

            self._data_sets_by_year_cache[year] = self._datasets_from_source_dict(
                datasets
            )

        return self._data_sets_by_year_cache[year]

    @staticmethod
    def _datasets_from_source_dict(datasets) -> pd.DataFrame:
        """
        Parse a dict from :py:meth:`VariableSource.get_datasets` into a data frame of data sets.

        Parameters
        ----------
        datasets
            The data sets in dictionary form.

        Returns
        -------
        A data frame with a row describing each data set.
        """
        datasets = datasets["dataset"]
        df_datasets = pd.DataFrame(
            [
                {
                    "YEAR": dataset.get(
                        "c_vintage",
                        "timeseries" if dataset.get("c_isTimeseries", False) else None,
                    ),
                    "DATASET": "/".join(dataset["c_dataset"]),
                    "TITLE": dataset.get("title", None),
                    "DESCRIPTION": dataset.get("description", None),
                    "API BASE URL": (
                        dataset["distribution"][0].get("accessURL", None)
                        if dataset.get("distribution")
                        else None
                    ),
                }
                for dataset in datasets
            ]
        )

        symbol_dict_reversed = {
            value: symbol
            for symbol, value in censusdis.datasets.__dict__.items()
            if isinstance(value, str)
        }

        df_datasets["SYMBOL"] = df_datasets["DATASET"].apply(
            lambda name: symbol_dict_reversed.get(name, None)
        )

        df_datasets = df_datasets[
            ["YEAR", "SYMBOL", "DATASET"]
            + [
                col
                for col in df_datasets.columns
                if col not in ["YEAR", "SYMBOL", "DATASET"]
            ]
        ]

        return df_datasets.sort_values(["YEAR", "DATASET"]).reset_index(drop=True)

    def all_data_sets(self, *, year: Optional[int] = None) -> pd.DataFrame:
        """
        Retrieve a description of available data sets.

        Parameters
        ----------
        year
            The year to query. If not provided, all data sets for all
            years are queried.

        Returns
        -------
        A data frame describing the data sets that are available.
        """
        if year is not None:
            return self._data_sets_for_year(year)

        return self._all_data_sets()
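
    # Illustrative sketch (not part of the original module): listing data sets.
    #
    #     variables = VariableCache()
    #     df_2020 = variables.all_data_sets(year=2020)  # one row per data set available for 2020
    #     df_all = variables.all_data_sets()            # all years; "YEAR" is "timeseries" for time series data sets
    #     df_2020[["YEAR", "SYMBOL", "DATASET", "TITLE"]].head()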

    @staticmethod
    def _compile_pattern(pattern, case):
        """Compile patterns passed to search methods, applying the case flag."""
        if isinstance(pattern, str):
            flags = re.IGNORECASE if not case else 0
            pattern = re.compile(pattern, flags=flags)
        return pattern

    def search_data_sets(
        self,
        *,
        vintage: Optional[Union[int, Iterable[int]]] = None,
        pattern: Optional[Union[str, re.Pattern]] = None,
        case: bool = False,
    ) -> pd.DataFrame:
        """
        Search for data sets over one or more vintages.

        Parameters
        ----------
        vintage
            One or more vintages to explore. If `None`, data sets for all
            vintages are considered.
        pattern
            A regular expression to match against the symbol, name, title, and
            description of a data set. This is used to filter down results.
        case
            If `pattern` is not `None`, indicates whether the regular expression
            match is case sensitive.

        Returns
        -------
        A data frame of matching data sets.
        """
        if vintage is None or isinstance(vintage, int):
            vintage = [vintage]

        df_datasets = pd.concat(
            (self.all_data_sets(year=year) for year in vintage), ignore_index=True
        )

        if pattern is not None:
            pattern = self._compile_pattern(pattern, case)

            df_matches = df_datasets[
                df_datasets["SYMBOL"].str.contains(pattern)
                | df_datasets["DATASET"].str.contains(pattern)
                | df_datasets["TITLE"].str.contains(pattern)
                | df_datasets["DESCRIPTION"].str.contains(pattern)
            ]
        else:
            df_matches = df_datasets

        return df_matches.reset_index(drop=True)
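
    # Illustrative sketch (not part of the original module): searching data sets
    # by regular expression. The pattern text is only an example.
    #
    #     variables = VariableCache()
    #     variables.search_data_sets(vintage=2020, pattern=r"american community survey")
    #     # `case=False` (the default) makes the match case insensitive.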

    def all_groups(
        self,
        dataset: str,
        year: int,
    ) -> pd.DataFrame:
        """
        Get descriptions of all the groups in the data set.

        Parameters
        ----------
        dataset
            The data set.
        year
            The year.

        Returns
        -------
        Metadata on all the groups in the data set.
        """
        groups = self._variable_source.get_all_groups(dataset, year)

        # Some data sets have no groups.
        if len(groups["groups"]) == 0:
            return pd.DataFrame(columns=["DATASET", "YEAR", "GROUP", "DESCRIPTION"])

        return (
            pd.DataFrame(
                [
                    {
                        "DATASET": dataset,
                        "YEAR": year,
                        "GROUP": group["name"],
                        "DESCRIPTION": group["description"],
                    }
                    for group in groups["groups"]
                ]
            )
            .sort_values(["DATASET", "YEAR", "GROUP"])
            .reset_index(drop=True)
        )

    def search_groups(
        self,
        dataset: str,
        vintage: Union[int, Iterable[int]],
        *,
        pattern: Optional[Union[str, re.Pattern]] = None,
        case: bool = False,
    ) -> pd.DataFrame:
        """
        Search for groups in a data set over one or more vintages.

        Parameters
        ----------
        dataset
            The data set.
        vintage
            One or more vintages to explore.
        pattern
            A regular expression to match against the name and description of a
            group. This is used to filter down results.
        case
            If `pattern` is not `None`, indicates whether the regular expression
            match is case sensitive.

        Returns
        -------
        A data frame of matching groups.
        """
        if vintage is None or isinstance(vintage, int):
            vintage = [vintage]

        def _all_groups_eat_404(year: int):
            """
            Skip a bad year and return no results.

            We assume it is a bad year if we get a 404.
            """
            try:
                return self.all_groups(dataset, year)
            except CensusApiException as e:
                if "404" in str(e):
                    return pd.DataFrame()
                else:
                    raise e

        df_groups = pd.concat(
            (_all_groups_eat_404(year) for year in vintage), ignore_index=True
        )

        if pattern is not None:
            pattern = self._compile_pattern(pattern, case)

            df_matches = df_groups[
                df_groups["GROUP"].str.contains(pattern)
                | df_groups["DESCRIPTION"].str.contains(pattern)
            ]
        else:
            df_matches = df_groups

        return df_matches.reset_index(drop=True)
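
    # Illustrative sketch (not part of the original module): searching for groups
    # across several vintages at once. The pattern text is only an example.
    #
    #     variables = VariableCache()
    #     variables.search_groups("acs/acs5", [2019, 2020], pattern=r"hispanic or latino")
    #     # Vintages that return a 404 from the API are silently skipped.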

    def all_variables(
        self,
        dataset: str,
        year: int,
        group_name: Optional[str],
        *,
        skip_annotations: bool = True,
        skip_subgroup_variables: bool = True,
    ) -> pd.DataFrame:
        """
        Produce a data frame of metadata on all variables in a group.

        Parameters
        ----------
        dataset
            The data set.
        year
            The year.
        group_name
            The group.
        skip_annotations
            If `True`, try to filter out variables that are
            annotations rather than actual values, by skipping
            those with labels that begin with "Annotation" or
            "Margin of Error".
        skip_subgroup_variables
            If this is `True`, then we will ignore variables from alphabetical
            subgroups. These are relatively common in the ACS, where there are
            groups like `B01001` that have subgroups `B01001A`, `B01001B` and
            so on. The underlying census API sometimes reports variables like
            `B01001A_001E` from these as members of `B01001` and other times as
            members of `B01001A`. Setting this `True`, which is the default,
            means `B01001A_001E` is not reported when the group name is `B01001`.

        Returns
        -------
        Metadata on all variables in the group.
        """
        group_variables = self.group_variables(
            dataset,
            year,
            group_name,
            skip_annotations=skip_annotations,
            skip_subgroup_variables=skip_subgroup_variables,
        )

        def variable_items(variable_dict: Dict) -> Optional[Dict[str, str]]:
            if "values" in variable_dict:
                values = variable_dict["values"]
                return values.get("item", np.nan)

            return None

        return pd.DataFrame(
            [
                {
                    "YEAR": year,
                    "DATASET": dataset,
                    "GROUP": self.get(dataset, year, variable_name).get(
                        "group", np.nan
                    ),
                    "VARIABLE": variable_name,
                    "LABEL": self.get(dataset, year, variable_name)["label"],
                    "SUGGESTED_WEIGHT": self.get(dataset, year, variable_name).get(
                        "suggested-weight", np.nan
                    ),
                    "VALUES": variable_items(self.get(dataset, year, variable_name)),
                }
                for variable_name in group_variables
            ]
        )
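
    # Illustrative sketch (not part of the original module): metadata for every
    # variable in a group as a data frame.
    #
    #     variables = VariableCache()
    #     df_vars = variables.all_variables("acs/acs5", 2020, "B03002")
    #     df_vars[["VARIABLE", "LABEL"]]  # one row per variable, e.g. "B03002_003E"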

    def search(
        self,
        dataset: str,
        vintage: Union[int, Iterable[int]],
        *,
        group_name: Optional[str] = None,
        name: Optional[Union[str, Iterable[str]]] = None,
        pattern: Optional[Union[str, re.Pattern]] = None,
        case: bool = False,
        skip_annotations: bool = True,
        skip_subgroup_variables: bool = True,
    ) -> pd.DataFrame:
        """
        Search for variables in a data set over one or more vintages.

        Parameters
        ----------
        dataset
            The data set.
        vintage
            One or more vintages to explore.
        group_name
            The group if we should explore only a single group. If `None`, all
            groups will be explored.
        name
            The name of one or more variables to explore. If `None`, all variables
            are considered. Normally at most one of `name` and `pattern` will be used.
        pattern
            A regular expression to match against the name and label of a variable.
            This is used to filter down results. Normally at most one of `name` and
            `pattern` will be used.
        case
            If `pattern` is not `None`, indicates whether the regular expression
            match is case sensitive. Does not affect the `name` match.
        skip_annotations
            If `True`, try to filter out variables that are
            annotations rather than actual values, by skipping
            those with labels that begin with "Annotation" or
            "Margin of Error".
        skip_subgroup_variables
            If this is `True`, then we will ignore variables from alphabetical
            subgroups. These are relatively common in the ACS, where there are
            groups like `B01001` that have subgroups `B01001A`, `B01001B` and
            so on. The underlying census API sometimes reports variables like
            `B01001A_001E` from these as members of `B01001` and other times as
            members of `B01001A`. Setting this `True`, which is the default,
            means `B01001A_001E` is not reported when the group name is `B01001`.

        Returns
        -------
        A data frame of matching variables.
        """
        if isinstance(vintage, int):
            vintage = [vintage]

        def _all_variables_eat_404(year: int):
            """
            Skip a bad year and return no results.

            We assume it is a bad year if we get a 404.
            """
            try:
                return self.all_variables(
                    dataset,
                    year,
                    group_name,
                    skip_annotations=skip_annotations,
                    skip_subgroup_variables=skip_subgroup_variables,
                )
            except CensusApiException as e:
                if "404" in str(e):
                    return pd.DataFrame()
                else:
                    raise e

        df_all_variables = pd.concat(
            (_all_variables_eat_404(year) for year in vintage), ignore_index=True
        )

        # If we were given names to match on, then match on them.
        if name is not None:
            if isinstance(name, str):
                name = [name]

            df_name_matches = pd.concat(
                [
                    df_all_variables[df_all_variables["VARIABLE"] == var_name]
                    for var_name in name
                ],
                axis="rows",
            )
        else:
            df_name_matches = df_all_variables

        if pattern is not None:
            pattern = self._compile_pattern(pattern, case)

            df_matches = df_name_matches[
                (
                    df_name_matches["VARIABLE"].str.contains(pattern)
                    | df_name_matches["LABEL"].str.contains(pattern)
                )
            ]
        else:
            df_matches = df_name_matches

        return pd.DataFrame(df_matches).reset_index(drop=True)
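
    # Illustrative sketch (not part of the original module): searching variables
    # by name or by regular expression.
    #
    #     variables = VariableCache()
    #     variables.search("acs/acs5", 2020, group_name="B03002", pattern=r"not hispanic")
    #     variables.search("acs/acs5", [2019, 2020], name=["B03002_002E", "B03002_003E"])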

    def group_tree(
        self,
        dataset: str,
        year: int,
        group_name: Optional[str],
        *,
        skip_annotations: bool = True,
    ) -> "VariableCache.GroupTreeNode":
        """
        Construct a tree that embodies the parent/child relationships of all the variables in a group.

        Parameters
        ----------
        dataset
            The data set.
        year
            The year.
        group_name
            The group.
        skip_annotations
            If `True`, skip variables that are annotations of others, like
            margin of error.

        Returns
        -------
        A tree that can be printed or walked.
        """
        group = self.get_group(dataset, year, group_name)

        root = VariableCache.GroupTreeNode()

        for variable_name, details in group.items():
            path = details["label"].split("!!")

            if skip_annotations and (
                path[0].startswith("Annotation")
                or path[0].startswith("Margin of Error")
                or path[0].startswith("Statistical Significance")
            ):
                continue

            node = root

            # Construct a nested path of nodes down to the
            # leaf.
            for component in path:
                child = node.get(component, None)
                if child is None:
                    child = VariableCache.GroupTreeNode()
                    node.add_child(component, child)
                node = child

            # Put the variable name at the leaf.
            node.name = variable_name

        return root
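
    # Illustrative sketch (not part of the original module): how labels become a
    # tree. An ACS label such as "Estimate!!Total:!!Not Hispanic or Latino:!!White alone"
    # is split on "!!" into path components, and the variable name is stored on the
    # leaf node at the end of that path.
    #
    #     variables = VariableCache()
    #     tree = variables.group_tree("acs/acs5", 2020, "B03002")
    #     print(tree)  # indented rendering, with variable names shown on the nodes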

    def group_leaves(
        self, dataset: str, year: int, name: str, *, skip_annotations: bool = True
    ) -> List[str]:
        """
        Find the leaves of a given group.

        Parameters
        ----------
        dataset
            The census dataset.
        year
            The year.
        name
            The name of the group.
        skip_annotations
            If `True`, try to filter out variables that are
            annotations rather than actual values, by skipping
            those with labels that begin with "Annotation" or
            "Margin of Error".

        Returns
        -------
        A list of the variables in the group that are leaves,
        i.e. they are not aggregates of other variables. For example,
        in the group `B03002` in the `acs/acs5` dataset in the
        year `2020`, the variable `B03002_003E` is a leaf, because
        it represents
        "Estimate!!Total:!!Not Hispanic or Latino:!!White alone",
        whereas `B03002_002E` is not a leaf because it represents
        "Estimate!!Total:!!Not Hispanic or Latino:", which is a total
        that includes `B03002_003E` as well as others like `B03002_004E`,
        `B03002_005E` and more.

        The typical reason we want leaves is that they give us a set
        of variables representing counts that do not overlap and add up
        to the total. We can use these directly in diversity and integration
        calculations using the `divintseg` package.
        """
        tree = self.group_tree(dataset, year, name)

        leaves = tree.leaf_variables()

        if skip_annotations:
            group = self.get_group(dataset, year, name)
            leaves = (
                leaf
                for leaf in leaves
                if (not group[leaf]["label"].startswith("Annotation"))
                and (not group[leaf]["label"].startswith("Margin of Error"))
            )

        return sorted(leaves)
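
    # Illustrative sketch (not part of the original module): non-overlapping leaf
    # variables of a group, suitable for sums and diversity calculations.
    #
    #     variables = VariableCache()
    #     leaves = variables.group_leaves("acs/acs5", 2020, "B03002")
    #     "B03002_003E" in leaves   # True; a leaf such as "...White alone"
    #     "B03002_002E" in leaves   # False; it is an aggregate of other variables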

    def group_variables(
        self,
        dataset: str,
        year: int,
        group_name: str,
        *,
        skip_annotations: bool = True,
        skip_subgroup_variables: bool = True,
    ) -> List[str]:
        """
        Find the variables of a given group.

        Parameters
        ----------
        dataset
            The census dataset.
        year
            The year.
        group_name
            The name of the group.
        skip_annotations
            If `True`, try to filter out variables that are
            annotations rather than actual values, by skipping
            those with labels that begin with "Annotation" or
            "Margin of Error".
        skip_subgroup_variables
            If this is `True`, then we will ignore variables from alphabetical
            subgroups. These are relatively common in the ACS, where there are
            groups like `B01001` that have subgroups `B01001A`, `B01001B` and
            so on. The underlying census API sometimes reports variables like
            `B01001A_001E` from these as members of `B01001` and other times as
            members of `B01001A`. Setting this `True`, which is the default,
            means `B01001A_001E` is not reported when the group name is `B01001`.

        Returns
        -------
        A list of the variables in the group.
        """
        tree = self.get_group(
            dataset, year, group_name, skip_subgroup_variables=skip_subgroup_variables
        )

        if skip_annotations:
            group_variables = [
                k
                for k, v in tree.items()
                if (not v["label"].startswith("Annotation"))
                and (not v["label"].startswith("Margin of Error"))
            ]
        else:
            group_variables = list(tree.keys())

        return sorted(group_variables)

    def __contains__(self, item: Tuple[str, int, str]) -> bool:
        """Magic method behind the `in` operator."""
        source, year, name = item

        return name in self._variable_cache[source][year]

    def __getitem__(self, item: Tuple[str, int, str]):
        """Magic method behind the `[]` operator."""
        return self.get(*item)

    def __len__(self):
        """Return the number of elements in the cache."""
        return sum(
            len(names)
            for years in self._variable_cache.values()
            for names in years.values()
        )

    def keys(self) -> Iterable[Tuple[str, int, str]]:
        """Keys, i.e. the names of variables, in the cache."""
        for key, _ in self.items():
            yield key

    def __iter__(self) -> Iterable[Tuple[str, int, str]]:
        """Return an iterator over the keys."""
        return self.keys()

    def values(self) -> Iterable[dict]:
        """Values, i.e. the descriptions of variables, in the cache."""
        for _, value in self.items():
            yield value

    def items(self) -> Iterable[Tuple[Tuple[str, int, str], dict]]:
        """Items in the mapping from variable name to description."""
        for source, values_for_source in self._variable_cache.items():
            for year, values_for_year in values_for_source.items():
                for name, value in values_for_year.items():
                    yield (source, year, name), value

    def invalidate(self, dataset: str, year: int, name: str):
        """Remove an item from the cache."""
        if self._variable_cache[dataset][year].pop(name, None):
            if len(self._variable_cache[dataset][year]) == 0:
                self._variable_cache[dataset].pop(year)
                if len(self._variable_cache[dataset]) == 0:
                    self._variable_cache.pop(dataset)

    def clear(self):
        """
        Clear the entire cache.

        This just means that further calls to :py:meth:`~get` will
        have to make a call to the source behind the cache.
        """
        self._variable_cache = defaultdict(lambda: defaultdict(dict))
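
    # Illustrative sketch (not part of the original module): the cache itself acts
    # like a mapping keyed by (dataset, year, variable name) tuples.
    #
    #     variables = VariableCache()
    #     variables["acs/acs5", 2020, "B03002_003E"]       # same as variables.get(...)
    #     ("acs/acs5", 2020, "B03002_003E") in variables   # True only once it has been cached
    #     len(variables)                                   # number of cached variable descriptions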