Coverage for censusdis/impl/varcache.py: 87%
242 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
1# Copyright (c) 2022 Darren Erik Vengroff
"""Variable cache code to cache metadata about variables locally."""
4from collections import defaultdict
5from logging import getLogger
6from typing import (
7 Any,
8 DefaultDict,
9 Dict,
10 Generator,
11 Iterable,
12 List,
13 Optional,
14 Tuple,
15 Union,
16)
18import numpy as np
19import pandas as pd
21from censusdis import CensusApiException
22from censusdis.impl.varsource.base import VariableSource
23from censusdis.impl.varsource.censusapi import CensusApiVariableSource
25import censusdis.datasets
27import re
30logger = getLogger(__name__)
class VariableCache:
    """
    A cache of variables and groups.

    This looks a lot like a :py:class:`~VariableSource` but it
    implements a cache in front of a :py:class:`~VariableSource`.

    Users will rarely if ever need to construct one of these
    themselves. In almost all cases they will use the singleton
    `censusdis.censusdata.variables`.
    """

    def __init__(self, *, variable_source: Optional[VariableSource] = None):
        """Construct a cache, optionally over an injected variable source."""
        # When no source is injected, fall back to the live census API.
        self._variable_source = (
            variable_source if variable_source is not None else CensusApiVariableSource()
        )

        # Nested maps: dataset -> year -> variable name -> variable details.
        self._variable_cache: DefaultDict[str, DefaultDict[int, Dict[str, Any]]] = (
            defaultdict(lambda: defaultdict(dict))
        )
        # Nested maps: dataset -> year -> group name -> list of variable names.
        self._group_cache: DefaultDict[str, DefaultDict[int, Dict[str, Any]]] = (
            defaultdict(lambda: defaultdict(dict))
        )

        # Lazily populated caches for data set metadata queries.
        self._all_data_sets_cache: Optional[pd.DataFrame] = None
        self._data_sets_by_year_cache: Dict[int, pd.DataFrame] = {}
60 def get(
61 self,
62 dataset: str,
63 year: int,
64 name: str,
65 ) -> Dict[str, Dict]:
66 """
67 Get the description of a given variable.
69 See :py:meth:`VariableSource.get`
70 for details on the data format. We first look in the cache and then if
71 we don't find what we are looking for, we call the source behind us and
72 cache the results before returning them.
74 Parameters
75 ----------
76 dataset
77 The census dataset.
78 year
79 The year
80 name
81 The name of the variable.
83 Returns
84 -------
85 The details of the variable.
86 """
87 cached_value = self._variable_cache[dataset][year].get(name, None)
89 if cached_value is not None:
90 return cached_value
92 value = self._variable_source.get(dataset, year, name)
94 self._variable_cache[dataset][year][name] = value
96 return value
98 def get_group(
99 self,
100 dataset: str,
101 year: int,
102 name: Optional[str],
103 skip_subgroup_variables: bool = True,
104 ) -> Dict[str, Dict]:
105 """
106 Get information on the variables in a group.
108 Parameters
109 ----------
110 dataset
111 The census dataset.
112 year
113 The year
114 name
115 The name of the group. Or None if this data set does not have
116 groups.
117 skip_subgroup_variables
118 If this is `True`, then we will ignore variables from alphabetical
119 subgroups. These are relatively common in the ACS, where there are
120 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
121 so on. The underlying census API sometimes reports variables like
122 `B01001A_001E` from these as members of `B01001` and other times as
123 members of `B01001A`. Setting this `True`, which is the default,
124 does not report `B01001A_001E` when the group name `name='B01001'`.
126 Returns
127 -------
128 A dictionary that maps from the names of each variable in the group
129 to a dictionary containing a description of the variable. The
130 format of the description is a dictionary as described in
131 the documentation for
132 :py:meth:`VariableSource.get`.
133 """
134 group_variable_names = self._group_cache[dataset][year].get(name, None)
136 if group_variable_names is None:
137 # Missed in the cache, so go fetch it.
138 value = self._variable_source.get_group(dataset, year, name)
140 # Cache all the variables in the group.
141 group_variables = value["variables"]
143 for variable_name, variable_details in group_variables.items():
144 self._variable_cache[dataset][year][variable_name] = variable_details
146 # Cache the names of the variables in the group.
147 group_variable_names = list(
148 variable_name for variable_name in group_variables
149 )
150 self._group_cache[dataset][year][name] = group_variable_names
152 # Optionally filter out the variables that are in
153 # alphabetical subgroups.
154 if skip_subgroup_variables and name is not None:
155 subgroup_var_pattern = re.compile(f"^{name}[A-Z]_.*$")
156 group_variable_names = [
157 group_variable_name
158 for group_variable_name in group_variable_names
159 if not (
160 group_variable_name.startswith(name)
161 and subgroup_var_pattern.match(group_variable_name)
162 )
163 ]
165 # Reformat what we return so it includes the full
166 # details on each variable.
167 return {
168 group_variable_name: self.get(dataset, year, group_variable_name)
169 for group_variable_name in group_variable_names
170 }
class GroupTreeNode:
    """A node in a tree of variables that make up a group."""

    def __init__(self, name: Optional[str] = None):
        # The variable name stored at this node, if any.
        self._name = name
        # Children keyed by the next label path component.
        self._children: Dict[str, "VariableCache.GroupTreeNode"] = {}

    @property
    def name(self):
        """The name of the node."""
        return self._name

    @name.setter
    def name(self, name: Optional[str]):
        self._name = name

    def add_child(self, path_component: str, child: "VariableCache.GroupTreeNode"):
        """
        Add a child to a node.

        Parameters
        ----------
        path_component
            The next component of the path to the variable, beyond the
            path to `self`.
        child
            The node that should be our child at the specified path component.

        Returns
        -------
        None
        """
        self._children[path_component] = child

    def is_leaf(self) -> bool:
        """
        Is the node a leaf.

        Returns
        -------
        `True` if it is a leaf; `False` if it is an internal node.
        """
        return not self._children

    def __len__(self):
        """Return the number of children the node has."""
        return len(self._children)

    def __contains__(self, component: str):
        """Determine if a component is in the node."""
        return component in self._children

    def __getitem__(self, component: str):
        """Get a child of a node."""
        return self._children[component]

    def keys(self) -> Generator[str, None, None]:
        """
        Return the keys, which are the strings of the next component to each child.

        Returns
        -------
        The keys
        """
        yield from self._children.keys()

    def values(self) -> Generator["VariableCache.GroupTreeNode", None, None]:
        """
        Return the values, which are our children.

        Returns
        -------
        The values (our children).
        """
        yield from self._children.values()

    def items(
        self,
    ) -> Generator[Tuple[str, "VariableCache.GroupTreeNode"], None, None]:
        """
        Return the items.

        The items are (key, value) pairs. See :py:meth:`keys` and
        :py:meth:`values`.

        Returns
        -------
        The items
        """
        yield from self._children.items()

    def get(
        self, component, default: Optional["VariableCache.GroupTreeNode"] = None
    ):
        """
        Get the child at the given path component below us.

        Parameters
        ----------
        component
            The next component of the path below us.
        default
            The default value to return if there is no node at the path.

        Returns
        -------
        The node below us or `default` if it is not there.
        """
        return self._children.get(component, default)

    def leaves(self) -> Generator["VariableCache.GroupTreeNode", None, None]:
        """
        Return all the leaves below us.

        Compare with :py:meth:`~leaf_variables`
        which returns just the names of the leaves.

        Returns
        -------
        All the leaves below us.
        """
        if self.is_leaf():
            yield self
        for child in self._children.values():
            yield from child.leaves()

    def leaf_variables(self) -> Generator[str, None, None]:
        """
        Return the names of all the leaves below us.

        Compare with :py:meth:`~leaves`
        which returns the full node for each leaf.

        Returns
        -------
        The names of the leaves below us.
        """
        for leaf in self.leaves():
            yield leaf.name

    @property
    def min_leaf_name(self) -> str:
        """The name of the first leaf."""
        return min(self.leaf_variables())

    def _node_str(self, level: int, component: str, indent_prefix: str) -> str:
        # Render one line of the indented tree representation.
        line = indent_prefix * level
        if self._children or self._name is not None:
            line = f"{line}+ {component}"
        if self.name is not None:
            line = f"{line} ({self.name})"

        return line

    def subtree_str(self, level: int, component: str, indent_prefix: str) -> str:
        """
        Return a string representing a subtree.

        Used to construct an indented string representation for the whole tree.
        """
        # Children are ordered by the smallest variable name beneath them.
        lines = [self._node_str(level, component, indent_prefix)]
        for child_component, child in sorted(
            self._children.items(), key=lambda item: item[1].min_leaf_name
        ):
            lines.append(
                child.subtree_str(level + 1, child_component, indent_prefix)
            )
        return "\n".join(lines)

    def __str__(self) -> str:
        """Return a string representation of the node."""
        return "\n".join(
            child.subtree_str(0, path_component, indent_prefix=" ")
            for path_component, child in sorted(
                self._children.items(), key=lambda item: item[1].min_leaf_name
            )
        )

    def __repr__(self) -> str:
        """Return the representation of the node."""
        return str(self)
359 def _all_data_sets(self) -> pd.DataFrame:
360 """
361 Get all the data sets.
363 Cache to avoid repeated remote calls.
365 Returns
366 -------
367 A data frame of all the data sets for all years.
368 """
369 if self._all_data_sets_cache is None:
370 datasets = self._variable_source.get_datasets(year=None)
372 self._all_data_sets_cache = self._datasets_from_source_dict(datasets)
374 return self._all_data_sets_cache
376 def _data_sets_for_year(self, year: int) -> pd.DataFrame:
377 """
378 Get all data sets for a given year.
380 Cache to avoid repeated remote calls.
382 Parameters
383 ----------
384 year
385 The year to query. If not provided, all data sets for all
386 years are queried.
388 Returns
389 -------
390 A data frame of all the data sets for the year.
391 """
392 if year not in self._data_sets_by_year_cache:
393 datasets = self._variable_source.get_datasets(year)
395 self._data_sets_by_year_cache[year] = self._datasets_from_source_dict(
396 datasets
397 )
399 return self._data_sets_by_year_cache[year]
@staticmethod
def _datasets_from_source_dict(datasets) -> pd.DataFrame:
    """
    Parse a dict from :py:meth:`VariableSource.get_datasets` into a data frame of data sets.

    Parameters
    ----------
    datasets
        The data sets in dictionary form.

    Returns
    -------
    A dataframe with a row describing each dataset.
    """

    def _row(dataset) -> dict:
        # Time series data sets carry no vintage year; mark them instead.
        year = dataset.get(
            "c_vintage",
            "timeseries" if dataset.get("c_isTimeseries", False) else None,
        )
        distribution = dataset.get("distribution")
        api_base_url = (
            distribution[0].get("accessURL", None) if distribution else None
        )
        return {
            "YEAR": year,
            "DATASET": "/".join(dataset["c_dataset"]),
            "TITLE": dataset.get("title", None),
            "DESCRIPTION": dataset.get("description", None),
            "API BASE URL": api_base_url,
        }

    df_datasets = pd.DataFrame([_row(dataset) for dataset in datasets["dataset"]])

    # Map each data set path back to the symbolic name declared in
    # censusdis.datasets, when one exists.
    symbol_for_path = {
        value: symbol
        for symbol, value in censusdis.datasets.__dict__.items()
        if isinstance(value, str)
    }

    df_datasets["SYMBOL"] = df_datasets["DATASET"].apply(
        lambda path: symbol_for_path.get(path, None)
    )

    # Put the identifying columns first, keeping the rest in order.
    leading_cols = ["YEAR", "SYMBOL", "DATASET"]
    trailing_cols = [col for col in df_datasets.columns if col not in leading_cols]
    df_datasets = df_datasets[leading_cols + trailing_cols]

    return df_datasets.sort_values(["YEAR", "DATASET"]).reset_index(drop=True)
457 def all_data_sets(self, *, year: Optional[int] = None) -> pd.DataFrame:
458 """
459 Retrieve a description of available data sets.
461 Parameters
462 ----------
463 year
464 The year to query. If not provided, all data sets for all
465 years are queried.
467 Returns
468 -------
469 A data frame describing the data sets that are available.
470 """
471 if year is not None:
472 return self._data_sets_for_year(year)
474 return self._all_data_sets()
476 @staticmethod
477 def _compile_pattern(pattern, case):
478 """Compile patterns passed to search methods with case flag."""
479 if isinstance(pattern, str):
480 flags = re.IGNORECASE if not case else 0
481 pattern = re.compile(pattern, flags=flags)
482 return pattern
484 def search_data_sets(
485 self,
486 *,
487 vintage: Optional[Union[int, Iterable[int]]] = None,
488 pattern: Optional[Union[str, re.Pattern]] = None,
489 case: bool = False,
490 ) -> pd.DataFrame:
491 """
492 Search for data sets over one or more vintages.
494 Parameters
495 ----------
496 vintage
497 One or more Vintages to explore.
498 pattern
499 A regular expression to match against the name and description of a variable. This
500 is used to filter down results. Normally at most one of `name` and `re` will be used.
501 case:
502 If `patters` is not `None` then indicates whether the regular expression match is
503 case sensitive. Does not affect the `name` match.
505 Returns
506 -------
507 A data frame of matching variables.
508 """
509 if vintage is None or isinstance(vintage, int):
510 vintage = [vintage]
512 df_datasets = pd.concat(
513 (self.all_data_sets(year=year) for year in vintage), ignore_index=True
514 )
516 if pattern is not None:
517 pattern = self._compile_pattern(pattern, case)
519 df_matches = df_datasets[
520 df_datasets["SYMBOL"].str.contains(pattern)
521 | df_datasets["DATASET"].str.contains(pattern)
522 | df_datasets["TITLE"].str.contains(pattern)
523 | df_datasets["DESCRIPTION"].str.contains(pattern)
524 ]
525 else:
526 df_matches = df_datasets
528 return df_matches.reset_index(drop=True)
530 def all_groups(
531 self,
532 dataset: str,
533 year: int,
534 ) -> pd.DataFrame:
535 """
536 Get descriptions of all the groups in the data set.
538 Parameters
539 ----------
540 dataset
541 The data set.
542 year
543 The year.
545 Returns
546 -------
547 Metadata on all the groups in the data set.
548 """
549 groups = self._variable_source.get_all_groups(dataset, year)
551 # Some data sets have no groups.
552 if len(groups["groups"]) == 0:
553 return pd.DataFrame(columns=["DATASET", "YEAR", "GROUP", "DESCRIPTION"])
555 return (
556 pd.DataFrame(
557 [
558 {
559 "DATASET": dataset,
560 "YEAR": year,
561 "GROUP": group["name"],
562 "DESCRIPTION": group["description"],
563 }
564 for group in groups["groups"]
565 ]
566 )
567 .sort_values(["DATASET", "YEAR", "GROUP"])
568 .reset_index(drop=True)
569 )
571 def search_groups(
572 self,
573 dataset: str,
574 vintage: Union[int, Iterable[int]],
575 *,
576 pattern: Optional[Union[str, re.Pattern]] = None,
577 case: bool = False,
578 ) -> pd.DataFrame:
579 """
580 Search for groups in a data set over one or more vintages.
582 Parameters
583 ----------
584 dataset
585 The data set.
586 vintage
587 One or more Vintages to explore.
588 pattern
589 A regular expression to match against the name and description of a variable. This
590 is used to filter down results. Normally at most one of `name` and `re` will be used.
591 case:
592 If `patters` is not `None` then indicates whether the regular expression match is
593 case sensitive. Does not affect the `name` match.
595 Returns
596 -------
597 A data frame of matching variables.
598 """
599 if vintage is None or isinstance(vintage, int):
600 vintage = [vintage]
602 def _all_groups_eat_404(year: int):
603 """
604 Skip bad year and return no results.
606 We assume it is a bad year if we get a 404.
607 """
608 try:
609 return self.all_groups(dataset, year)
610 except CensusApiException as e:
611 if "404" in str(e):
612 return pd.DataFrame()
613 else:
614 raise e
616 df_groups = pd.concat(
617 (_all_groups_eat_404(year) for year in vintage), ignore_index=True
618 )
620 if pattern is not None:
621 pattern = self._compile_pattern(pattern, case)
623 df_matches = df_groups[
624 df_groups["GROUP"].str.contains(pattern)
625 | df_groups["DESCRIPTION"].str.contains(pattern)
626 ]
627 else:
628 df_matches = df_groups
630 return df_matches.reset_index(drop=True)
632 def all_variables(
633 self,
634 dataset: str,
635 year: int,
636 group_name: Optional[str],
637 *,
638 skip_annotations: bool = True,
639 skip_subgroup_variables: bool = True,
640 ) -> pd.DataFrame:
641 """
642 Produce a data frame of metadata on all variables in a group.
644 Parameters
645 ----------
646 dataset
647 The data set.
648 year
649 The year.
650 group_name
651 The group.
652 skip_annotations
653 If `True` try to filter out variables that are
654 annotations rather than actual values, by skipping
655 those with labels that begin with "Annotation" or
656 "Margin of Error".
657 skip_subgroup_variables
658 If this is `True`, then we will ignore variables from alphabetical
659 subgroups. These are relatively common in the ACS, where there are
660 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
661 so on. The underlying census API sometimes reports variables like
662 `B01001A_001E` from these as members of `B01001` and other times as
663 members of `B01001A`. Setting this `True`, which is the default,
664 does not report `B01001A_001E` when the group name `name='B01001'`.
666 Returns
667 -------
668 Metadata on all variables in the group.
669 """
670 group_variables = self.group_variables(
671 dataset,
672 year,
673 group_name,
674 skip_annotations=skip_annotations,
675 skip_subgroup_variables=skip_subgroup_variables,
676 )
678 def variable_items(variable_dict: Dict) -> Optional[Dict[str, str]]:
679 if "values" in variable_dict:
680 values = variable_dict["values"]
681 return values.get("item", np.nan)
683 return None
685 return pd.DataFrame(
686 [
687 {
688 "YEAR": year,
689 "DATASET": dataset,
690 "GROUP": self.get(dataset, year, variable_name).get(
691 "group", np.nan
692 ),
693 "VARIABLE": variable_name,
694 "LABEL": self.get(dataset, year, variable_name)["label"],
695 "SUGGESTED_WEIGHT": self.get(dataset, year, variable_name).get(
696 "suggested-weight", np.nan
697 ),
698 "VALUES": variable_items(self.get(dataset, year, variable_name)),
699 }
700 for variable_name in group_variables
701 ]
702 )
704 def search(
705 self,
706 dataset: str,
707 vintage: Union[int, Iterable[int]],
708 *,
709 group_name: Optional[str] = None,
710 name: Optional[Union[str, Iterable[str]]] = None,
711 pattern: Optional[Union[str, re.Pattern]] = None,
712 case: bool = False,
713 skip_annotations: bool = True,
714 skip_subgroup_variables: bool = True,
715 ) -> pd.DataFrame:
716 """
717 Search for variables in a data set over one or more vintages.
719 Parameters
720 ----------
721 dataset
722 The data set.
723 vintage
724 One or more Vintages to explore.
725 group_name
726 The group if we should explore only a single group. If `None` all groups
727 will be explored.
728 name
729 The name of one of more variables to explore. If `None` all variables are considered.
730 Normally at most one of `name` and `re` will be used.
731 pattern
732 A regular expression to match against the name and description of a variable. This
733 is used to filter down results. Normally at most one of `name` and `re` will be used.
734 case:
735 If `patters` is not `None` then indicates whether the regular expression match is
736 case sensitive. Does not affect the `name` match.
737 skip_annotations
738 If `True` try to filter out variables that are
739 annotations rather than actual values, by skipping
740 those with labels that begin with "Annotation" or
741 "Margin of Error".
742 skip_subgroup_variables
743 If this is `True`, then we will ignore variables from alphabetical
744 subgroups. These are relatively common in the ACS, where there are
745 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
746 so on. The underlying census API sometimes reports variables like
747 `B01001A_001E` from these as members of `B01001` and other times as
748 members of `B01001A`. Setting this `True`, which is the default,
749 does not report `B01001A_001E` when the group name `name='B01001'`.
751 Returns
752 -------
753 A data frame of matching variables.
754 """
755 if isinstance(vintage, int):
756 vintage = [vintage]
758 def _all_variables_eat_404(year: int):
759 """
760 Skip bad year and return no results.
762 We assume it is a bad year if we get a 404.
763 """
764 try:
765 return self.all_variables(
766 dataset,
767 year,
768 group_name,
769 skip_annotations=skip_annotations,
770 skip_subgroup_variables=skip_subgroup_variables,
771 )
772 except CensusApiException as e:
773 if "404" in str(e):
774 return pd.DataFrame()
775 else:
776 raise e
778 df_all_variables = pd.concat(
779 (_all_variables_eat_404(year) for year in vintage), ignore_index=True
780 )
782 # If we were given names to match on, then match on them.
783 if name is not None:
784 if isinstance(name, str):
785 name = [name]
787 df_name_matches = pd.concat(
788 [
789 df_all_variables[df_all_variables["VARIABLE"] == var_name]
790 for var_name in name
791 ],
792 axis="rows",
793 )
794 else:
795 df_name_matches = df_all_variables
797 if pattern is not None:
798 pattern = self._compile_pattern(pattern, case)
800 df_matches = df_name_matches[
801 (
802 df_name_matches["VARIABLE"].str.contains(pattern)
803 | df_name_matches["LABEL"].str.contains(pattern)
804 )
805 ]
806 else:
807 df_matches = df_name_matches
809 return pd.DataFrame(df_matches).reset_index(drop=True)
811 def group_tree(
812 self,
813 dataset: str,
814 year: int,
815 group_name: Optional[str],
816 *,
817 skip_annotations: bool = True,
818 ) -> "VariableCache.GroupTreeNode":
819 """
820 Construct a tree that embodies the parent/child relationships of all the variables in a group.
822 Parameters
823 ----------
824 dataset
825 The data set.
826 year
827 The year.
828 group_name
829 The group.
830 skip_annotations
831 If `True`, skip variables that are annotations of others, like
832 margin of error.
834 Returns
835 -------
836 A tree that can be printed or walked.
837 """
838 group = self.get_group(dataset, year, group_name)
840 root = VariableCache.GroupTreeNode()
842 for variable_name, details in group.items():
843 path = details["label"].split("!!")
845 if skip_annotations and (
846 path[0].startswith("Annotation")
847 or path[0].startswith("Margin of Error")
848 or path[0].startswith("Statistical Significance")
849 ):
850 continue
852 node = root
854 # Construct a nested path of nodes down to the
855 # leaf.
856 for component in path:
857 child = node.get(component, None)
858 if child is None:
859 child = VariableCache.GroupTreeNode()
860 node.add_child(component, child)
861 node = child
863 # Put the variable name at the lead.
864 node.name = variable_name
866 return root
868 def group_leaves(
869 self, dataset: str, year: int, name: str, *, skip_annotations: bool = True
870 ) -> List[str]:
871 """
872 Find the leaves of a given group.
874 Parameters
875 ----------
876 dataset
877 The census dataset.
878 year
879 The year
880 name
881 The name of the group.
882 skip_annotations
883 If `True` try to filter out variables that are
884 annotations rather than actual values, by skipping
885 those with labels that begin with "Annotation" or
886 "Margin of Error".
888 Returns
889 -------
890 A list of the variables in the group that are leaves,
891 i.e. they are not aggregates of other variables. For example,
892 in the group `B03002` in the `acs/acs5` dataset in the
893 year `2020`, the variable `B03002_003E` is a leaf, because
894 it represents
895 "Estimate!!Total:!!Not Hispanic or Latino:!!White alone",
896 whereas B03002_002E is not a leaf because it represents
897 "Estimate!!Total:!!Not Hispanic or Latino:", which is a total
898 that includes B03002_003E as well as others like "B03002_004E",
899 "B03002_005E" and more.
901 The typical reason we want leaves is because that gives us a set
902 of variables representing counts that do not overlap and add up
903 to the total. We can use these directly in diversity and integration
904 calculations using the `divintseg` package.
905 """
906 tree = self.group_tree(dataset, year, name)
908 leaves = tree.leaf_variables()
910 if skip_annotations:
911 group = self.get_group(dataset, year, name)
912 leaves = (
913 leaf
914 for leaf in leaves
915 if (not group[leaf]["label"].startswith("Annotation"))
916 and (not group[leaf]["label"].startswith("Margin of Error"))
917 )
919 return sorted(leaves)
921 def group_variables(
922 self,
923 dataset: str,
924 year: int,
925 group_name: str,
926 *,
927 skip_annotations: bool = True,
928 skip_subgroup_variables: bool = True,
929 ) -> List[str]:
930 """
931 Find the variables of a given group.
933 Parameters
934 ----------
935 dataset
936 The census dataset.
937 year
938 The year
939 group_name
940 The name of the group.
941 skip_annotations
942 If `True` try to filter out variables that are
943 annotations rather than actual values, by skipping
944 those with labels that begin with "Annotation" or
945 "Margin of Error".
946 skip_subgroup_variables
947 If this is `True`, then we will ignore variables from alphabetical
948 subgroups. These are relatively common in the ACS, where there are
949 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
950 so on. The underlying census API sometimes reports variables like
951 `B01001A_001E` from these as members of `B01001` and other times as
952 members of `B01001A`. Setting this `True`, which is the default,
953 does not report `B01001A_001E` when the group name `name='B01001'`.
955 Returns
956 -------
957 A list of the variables in the group.
958 """
959 tree = self.get_group(
960 dataset, year, group_name, skip_subgroup_variables=skip_subgroup_variables
961 )
963 if skip_annotations:
964 group_variables = [
965 k
966 for k, v in tree.items()
967 if (not v["label"].startswith("Annotation"))
968 and (not v["label"].startswith("Margin of Error"))
969 ]
970 else:
971 group_variables = list(tree.keys())
973 return sorted(group_variables)
975 def __contains__(self, item: Tuple[str, int, str]) -> bool:
976 """Magic method behind the `in` operator."""
977 source, year, name = item
979 return name in self._variable_cache[source][year]
981 def __getitem__(self, item: Tuple[str, int, str]):
982 """Magic method behind the `[]` operator."""
983 return self.get(*item)
985 def __len__(self):
986 """Return he number of elements in the cache."""
987 return sum(
988 len(names)
989 for years in self._variable_cache.values()
990 for names in years.values()
991 )
993 def keys(self) -> Iterable[Tuple[str, int, str]]:
994 """Keys, i.e. the names of variables, in the cache."""
995 for key, _ in self.items():
996 yield key
998 def __iter__(self) -> Iterable[Tuple[str, int, str]]:
999 """Return an iterator over the keys."""
1000 return self.keys()
1002 def values(self) -> Iterable[dict]:
1003 """Values, i.e. the descriptions of variables, in the cache."""
1004 for _, value in self.items():
1005 yield value
1007 def items(self) -> Iterable[Tuple[Tuple[str, int, str], dict]]:
1008 """Items in the mapping from variable name to descpription."""
1009 for source, values_for_source in self._variable_cache.items():
1010 for year, values_for_year in values_for_source.items():
1011 for name, value in values_for_year.items():
1012 yield (source, year, name), value
1014 def invalidate(self, dataset: str, year: int, name: str):
1015 """Remove an item from the cache."""
1016 if self._variable_cache[dataset][year].pop(name, None):
1017 if len(self._variable_cache[dataset][year]) == 0:
1018 self._variable_cache[dataset].pop(year)
1019 if len(self._variable_cache[dataset]) == 0:
1020 self._variable_cache.pop(dataset)
1022 def clear(self):
1023 """
1024 Clear the entire cache.
1026 This just means that further calls to :py:meth:`~get` will
1027 have to make a call to the source behind the cache.
1028 """
1029 self._variable_cache = defaultdict(lambda: defaultdict(dict))