Coverage for censusdis/impl/varcache.py: 87%
242 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
1# Copyright (c) 2022 Darren Erik Vengroff
"""Variable cache code to cache metadata about variables locally."""
4from collections import defaultdict
5from logging import getLogger
6from typing import (
7 Any,
8 DefaultDict,
9 Dict,
10 Generator,
11 Iterable,
12 List,
13 Optional,
14 Tuple,
15 Union,
16)
18import numpy as np
19import pandas as pd
21from censusdis import CensusApiException
22from censusdis.impl.varsource.base import VariableSource
23from censusdis.impl.varsource.censusapi import CensusApiVariableSource
25import censusdis.datasets
27import re
30logger = getLogger(__name__)
class VariableCache:
    """
    A cache of variables and groups.

    This looks a lot like a :py:class:`~VariableSource` but it
    implements a cache in front of a :py:class:`~VariableSource`.

    Users will rarely if ever need to construct one of these
    themselves. In almost all cases they will use the singleton
    `censusdis.censusdata.variables`.
    """

    def __init__(self, *, variable_source: Optional[VariableSource] = None):
        """Construct a cache, optionally over an injected variable source."""
        # When no source is injected, fall back to the live census API.
        self._variable_source = (
            variable_source if variable_source is not None else CensusApiVariableSource()
        )

        # Nested maps: dataset -> year -> variable name -> variable details.
        self._variable_cache: DefaultDict[str, DefaultDict[int, Dict[str, Any]]] = (
            defaultdict(lambda: defaultdict(dict))
        )
        # Nested maps: dataset -> year -> group name -> list of variable names.
        self._group_cache: DefaultDict[str, DefaultDict[int, Dict[str, Any]]] = (
            defaultdict(lambda: defaultdict(dict))
        )

        # Lazily populated caches for data set metadata queries.
        self._all_data_sets_cache: Optional[pd.DataFrame] = None
        self._data_sets_by_year_cache: Dict[int, pd.DataFrame] = {}
60 def get(
61 self,
62 dataset: str,
63 year: int,
64 name: str,
65 ) -> Dict[str, Dict]:
66 """
67 Get the description of a given variable.
69 See :py:meth:`VariableSource.get`
70 for details on the data format. We first look in the cache and then if
71 we don't find what we are looking for, we call the source behind us and
72 cache the results before returning them.
74 Parameters
75 ----------
76 dataset
77 The census dataset.
78 year
79 The year
80 name
81 The name of the variable.
83 Returns
84 -------
85 The details of the variable.
86 """
87 cached_value = self._variable_cache[dataset][year].get(name, None)
89 if cached_value is not None:
90 return cached_value
92 value = self._variable_source.get(dataset, year, name)
94 self._variable_cache[dataset][year][name] = value
96 return value
98 def get_group(
99 self,
100 dataset: str,
101 year: int,
102 name: Optional[str],
103 skip_subgroup_variables: bool = True,
104 ) -> Dict[str, Dict]:
105 """
106 Get information on the variables in a group.
108 Parameters
109 ----------
110 dataset
111 The census dataset.
112 year
113 The year
114 name
115 The name of the group. Or None if this data set does not have
116 groups.
117 skip_subgroup_variables
118 If this is `True`, then we will ignore variables from alphabetical
119 subgroups. These are relatively common in the ACS, where there are
120 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
121 so on. The underlying census API sometimes reports variables like
122 `B01001A_001E` from these as members of `B01001` and other times as
123 members of `B01001A`. Setting this `True`, which is the default,
124 does not report `B01001A_001E` when the group name `name='B01001'`.
126 Returns
127 -------
128 A dictionary that maps from the names of each variable in the group
129 to a dictionary containing a description of the variable. The
130 format of the description is a dictionary as described in
131 the documentation for
132 :py:meth:`VariableSource.get`.
133 """
134 group_variable_names = self._group_cache[dataset][year].get(name, None)
136 if group_variable_names is None:
137 # Missed in the cache, so go fetch it.
138 value = self._variable_source.get_group(dataset, year, name)
140 # Cache all the variables in the group.
141 group_variables = value["variables"]
143 for variable_name, variable_details in group_variables.items():
144 self._variable_cache[dataset][year][variable_name] = variable_details
146 # Cache the names of the variables in the group.
147 group_variable_names = list(
148 variable_name for variable_name in group_variables
149 )
150 self._group_cache[dataset][year][name] = group_variable_names
152 # Optionally filter out the variables that are in
153 # alphabetical subgroups.
154 if skip_subgroup_variables and name is not None:
155 subgroup_var_pattern = re.compile(f"^{name}[A-Z]_.*$")
156 group_variable_names = [
157 group_variable_name
158 for group_variable_name in group_variable_names
159 if not (
160 group_variable_name.startswith(name)
161 and subgroup_var_pattern.match(group_variable_name)
162 )
163 ]
165 # Reformat what we return so it includes the full
166 # details on each variable.
167 return {
168 group_variable_name: self.get(dataset, year, group_variable_name)
169 for group_variable_name in group_variable_names
170 }
class GroupTreeNode:
    """A node in a tree of variables that make up a group."""

    def __init__(self, name: Optional[str] = None):
        # The variable name stored at this node, if any.
        self._name = name
        # Children keyed by the next label path component.
        self._children: Dict[str, "VariableCache.GroupTreeNode"] = {}

    @property
    def name(self):
        """The name of the node."""
        return self._name

    @name.setter
    def name(self, name: Optional[str]):
        self._name = name

    def add_child(self, path_component: str, child: "VariableCache.GroupTreeNode"):
        """
        Add a child to a node.

        Parameters
        ----------
        path_component
            The next component of the path to the variable, beyond the
            path to `self`.
        child
            The node that should be our child at the specified path component.

        Returns
        -------
        None
        """
        self._children[path_component] = child

    def is_leaf(self) -> bool:
        """
        Is the node a leaf.

        Returns
        -------
        `True` if it is a leaf; `False` if it is an internal node.
        """
        return not self._children

    def __len__(self):
        """Return the number of children the node has."""
        return len(self._children)

    def __contains__(self, component: str):
        """Determine if a component is in the node."""
        return component in self._children

    def __getitem__(self, component: str):
        """Get a child of a node."""
        return self._children[component]

    def keys(self) -> Generator[str, None, None]:
        """
        Return the keys, which are the strings of the next component to each child.

        Returns
        -------
        The keys
        """
        yield from self._children.keys()

    def values(self) -> Generator["VariableCache.GroupTreeNode", None, None]:
        """
        Return the values, which are our children.

        Returns
        -------
        The values (our children).
        """
        yield from self._children.values()

    def items(
        self,
    ) -> Generator[Tuple[str, "VariableCache.GroupTreeNode"], None, None]:
        """
        Return the items.

        The items are (key, value) pairs. See :py:meth:`keys` and
        :py:meth:`values`.

        Returns
        -------
        The items
        """
        yield from self._children.items()

    def get(
        self, component, default: Optional["VariableCache.GroupTreeNode"] = None
    ):
        """
        Get the child at the given path component below us.

        Parameters
        ----------
        component
            The next component of the path below us.
        default
            The default value to return if there is no node at the path.

        Returns
        -------
        The node below us or `default` if it is not there.
        """
        return self._children.get(component, default)

    def leaves(self) -> Generator["VariableCache.GroupTreeNode", None, None]:
        """
        Return all the leaves below us.

        Compare with :py:meth:`~leaf_variables`
        which returns just the names of the leaves.

        Returns
        -------
        All the leaves below us.
        """
        if self.is_leaf():
            yield self
        for child in self._children.values():
            yield from child.leaves()

    def leaf_variables(self) -> Generator[str, None, None]:
        """
        Return the names of all the leaves below us.

        Compare with :py:meth:`~leaves`
        which returns the full node for each leaf.

        Returns
        -------
        The names of the leaves below us.
        """
        for leaf in self.leaves():
            yield leaf.name

    @property
    def min_leaf_name(self) -> str:
        """The name of the first leaf."""
        return min(self.leaf_variables())

    def _node_str(self, level: int, component: str, indent_prefix: str) -> str:
        # Render one line of the indented tree representation.
        line = indent_prefix * level
        if self._children or self._name is not None:
            line = f"{line}+ {component}"
        if self.name is not None:
            line = f"{line} ({self.name})"

        return line

    def subtree_str(self, level: int, component: str, indent_prefix: str) -> str:
        """
        Return a string representing a subtree.

        Used to construct an indented string representation for the whole tree.
        """
        # Children are ordered by the smallest variable name beneath them.
        lines = [self._node_str(level, component, indent_prefix)]
        for child_component, child in sorted(
            self._children.items(), key=lambda item: item[1].min_leaf_name
        ):
            lines.append(
                child.subtree_str(level + 1, child_component, indent_prefix)
            )
        return "\n".join(lines)

    def __str__(self) -> str:
        """Return a string representation of the node."""
        return "\n".join(
            child.subtree_str(0, path_component, indent_prefix=" ")
            for path_component, child in sorted(
                self._children.items(), key=lambda item: item[1].min_leaf_name
            )
        )

    def __repr__(self) -> str:
        """Return the representation of the node."""
        return str(self)
359 def _all_data_sets(self) -> pd.DataFrame:
360 """
361 Get all the data sets.
363 Cache to avoid repeated remote calls.
365 Returns
366 -------
367 A data frame of all the data sets for all years.
368 """
369 if self._all_data_sets_cache is None:
370 datasets = self._variable_source.get_datasets(year=None)
372 self._all_data_sets_cache = self._datasets_from_source_dict(datasets)
374 return self._all_data_sets_cache
376 def _data_sets_for_year(self, year: int) -> pd.DataFrame:
377 """
378 Get all data sets for a given year.
380 Cache to avoid repeated remote calls.
382 Parameters
383 ----------
384 year
385 The year to query. If not provided, all data sets for all
386 years are queried.
388 Returns
389 -------
390 A data frame of all the data sets for the year.
391 """
392 if year not in self._data_sets_by_year_cache:
393 datasets = self._variable_source.get_datasets(year)
395 self._data_sets_by_year_cache[year] = self._datasets_from_source_dict(
396 datasets
397 )
399 return self._data_sets_by_year_cache[year]
@staticmethod
def _datasets_from_source_dict(datasets) -> pd.DataFrame:
    """
    Parse a dict from :py:meth:`VariableSource.get_datasets` into a data frame of data sets.

    Parameters
    ----------
    datasets
        The data sets in dictionary form.

    Returns
    -------
    A dataframe with a row describing each dataset.
    """

    def _row(dataset) -> dict:
        # Time series data sets carry no vintage year; mark them instead.
        year = dataset.get(
            "c_vintage",
            "timeseries" if dataset.get("c_isTimeseries", False) else None,
        )
        distribution = dataset.get("distribution")
        api_base_url = (
            distribution[0].get("accessURL", None) if distribution else None
        )
        return {
            "YEAR": year,
            "DATASET": "/".join(dataset["c_dataset"]),
            "TITLE": dataset.get("title", None),
            "DESCRIPTION": dataset.get("description", None),
            "API BASE URL": api_base_url,
        }

    df_datasets = pd.DataFrame([_row(dataset) for dataset in datasets["dataset"]])

    # Map each data set path back to the symbolic name declared in
    # censusdis.datasets, when one exists.
    symbol_for_path = {
        value: symbol
        for symbol, value in censusdis.datasets.__dict__.items()
        if isinstance(value, str)
    }

    df_datasets["SYMBOL"] = df_datasets["DATASET"].apply(
        lambda path: symbol_for_path.get(path, None)
    )

    # Put the identifying columns first, keeping the rest in order.
    leading_cols = ["YEAR", "SYMBOL", "DATASET"]
    trailing_cols = [col for col in df_datasets.columns if col not in leading_cols]
    df_datasets = df_datasets[leading_cols + trailing_cols]

    return df_datasets.sort_values(["YEAR", "DATASET"]).reset_index(drop=True)
457 def all_data_sets(self, *, year: Optional[int] = None) -> pd.DataFrame:
458 """
459 Retrieve a description of available data sets.
461 Parameters
462 ----------
463 year
464 The year to query. If not provided, all data sets for all
465 years are queried.
467 Returns
468 -------
469 A data frame describing the data sets that are available.
470 """
471 if year is not None:
472 return self._data_sets_for_year(year)
474 return self._all_data_sets()
476 @staticmethod
477 def _compile_pattern(pattern, case):
478 """Compile patterns passed to search methods with case flag."""
479 if isinstance(pattern, str):
480 flags = re.IGNORECASE if not case else 0
481 pattern = re.compile(pattern, flags=flags)
482 return pattern
484 def search_data_sets(
485 self,
486 *,
487 vintage: Optional[Union[int, Iterable[int]]] = None,
488 pattern: Optional[Union[str, re.Pattern]] = None,
489 case: bool = False,
490 ) -> pd.DataFrame:
491 """
492 Search for data sets over one or more vintages.
494 Parameters
495 ----------
496 vintage
497 One or more Vintages to explore.
498 pattern
499 A regular expression to match against the name and description of a variable. This
500 is used to filter down results. Normally at most one of `name` and `re` will be used.
501 case:
502 If `patters` is not `None` then indicates whether the regular expression match is
503 case sensitive. Does not affect the `name` match.
505 Returns
506 -------
507 A data frame of matching variables.
508 """
509 if vintage is None or isinstance(vintage, int):
510 vintage = [vintage]
512 df_datasets = pd.concat(
513 (self.all_data_sets(year=year) for year in vintage), ignore_index=True
514 )
516 if pattern is not None:
517 pattern = self._compile_pattern(pattern, case)
519 df_matches = df_datasets[
520 df_datasets["SYMBOL"].str.contains(pattern)
521 | df_datasets["DATASET"].str.contains(pattern)
522 | df_datasets["TITLE"].str.contains(pattern)
523 | df_datasets["DESCRIPTION"].str.contains(pattern)
524 ]
525 else:
526 df_matches = df_datasets
528 return df_matches.reset_index(drop=True)
530 def all_groups(
531 self,
532 dataset: str,
533 year: int,
534 ) -> pd.DataFrame:
535 """
536 Get descriptions of all the groups in the data set.
538 Parameters
539 ----------
540 dataset
541 The data set.
542 year
543 The year.
545 Returns
546 -------
547 Metadata on all the groups in the data set.
548 """
549 groups = self._variable_source.get_all_groups(dataset, year)
551 # Some data sets have no groups.
552 if len(groups["groups"]) == 0:
553 return pd.DataFrame(columns=["DATASET", "YEAR", "GROUP", "DESCRIPTION"])
555 return (
556 pd.DataFrame(
557 [
558 {
559 "DATASET": dataset,
560 "YEAR": year,
561 "GROUP": group["name"],
562 "DESCRIPTION": group["description"],
563 }
564 for group in groups["groups"]
565 ]
566 )
567 .sort_values(["DATASET", "YEAR", "GROUP"])
568 .reset_index(drop=True)
569 )
571 def search_groups(
572 self,
573 dataset: str,
574 vintage: Union[int, Iterable[int]],
575 *,
576 pattern: Optional[Union[str, re.Pattern]] = None,
577 case: bool = False,
578 ) -> pd.DataFrame:
579 """
580 Search for groups in a data set over one or more vintages.
582 Parameters
583 ----------
584 dataset
585 The data set.
586 vintage
587 One or more Vintages to explore.
588 pattern
589 A regular expression to match against the name and description of a variable. This
590 is used to filter down results. Normally at most one of `name` and `re` will be used.
591 case:
592 If `patters` is not `None` then indicates whether the regular expression match is
593 case sensitive. Does not affect the `name` match.
595 Returns
596 -------
597 A data frame of matching variables.
598 """
599 if vintage is None or isinstance(vintage, int):
600 vintage = [vintage]
602 def _all_groups_eat_404(year: int):
603 """
604 Skip bad year and return no results.
606 We assume it is a bad year if we get a 404.
607 """
608 try:
609 return self.all_groups(dataset, year)
610 except CensusApiException as e:
611 if "404" in str(e):
612 return pd.DataFrame()
613 else:
614 raise e
616 df_groups = pd.concat(
617 (_all_groups_eat_404(year) for year in vintage), ignore_index=True
618 )
620 if pattern is not None:
621 pattern = self._compile_pattern(pattern, case)
623 df_matches = df_groups[
624 df_groups["GROUP"].str.contains(pattern)
625 | df_groups["DESCRIPTION"].str.contains(pattern)
626 ]
627 else:
628 df_matches = df_groups
630 return df_matches.reset_index(drop=True)
632 def all_variables(
633 self,
634 dataset: str,
635 year: int,
636 group_name: Optional[str],
637 *,
638 skip_annotations: bool = True,
639 skip_subgroup_variables: bool = True,
640 ) -> pd.DataFrame:
641 """
642 Produce a data frame of metadata on all variables in a group.
644 Parameters
645 ----------
646 dataset
647 The data set.
648 year
649 The year.
650 group_name
651 The group.
652 skip_annotations
653 If `True` try to filter out variables that are
654 annotations rather than actual values, by skipping
655 those with labels that begin with "Annotation" or
656 "Margin of Error".
657 skip_subgroup_variables
658 If this is `True`, then we will ignore variables from alphabetical
659 subgroups. These are relatively common in the ACS, where there are
660 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
661 so on. The underlying census API sometimes reports variables like
662 `B01001A_001E` from these as members of `B01001` and other times as
663 members of `B01001A`. Setting this `True`, which is the default,
664 does not report `B01001A_001E` when the group name `name='B01001'`.
666 Returns
667 -------
668 Metadata on all variables in the group.
669 """
670 group_variables = self.group_variables(
671 dataset,
672 year,
673 group_name,
674 skip_annotations=skip_annotations,
675 skip_subgroup_variables=skip_subgroup_variables,
676 )
678 def variable_items(variable_dict: Dict) -> Optional[Dict[str, str]]:
679 if "values" in variable_dict:
680 values = variable_dict["values"]
681 return values.get("item", np.nan)
683 return None
685 return pd.DataFrame(
686 [
687 {
688 "YEAR": year,
689 "DATASET": dataset,
690 "GROUP": self.get(dataset, year, variable_name).get(
691 "group", np.nan
692 ),
693 "VARIABLE": variable_name,
694 "LABEL": self.get(dataset, year, variable_name)["label"],
695 "SUGGESTED_WEIGHT": self.get(dataset, year, variable_name).get(
696 "suggested-weight", np.nan
697 ),
698 "VALUES": variable_items(self.get(dataset, year, variable_name)),
699 }
700 for variable_name in group_variables
701 ]
702 )
704 def search(
705 self,
706 dataset: str,
707 vintage: Union[int, Iterable[int]],
708 *,
709 group_name: Optional[str] = None,
710 name: Optional[Union[str, Iterable[str]]] = None,
711 pattern: Optional[Union[str, re.Pattern]] = None,
712 case: bool = False,
713 skip_annotations: bool = True,
714 skip_subgroup_variables: bool = True,
715 ) -> pd.DataFrame:
716 """
717 Search for variables in a data set over one or more vintages.
719 Parameters
720 ----------
721 dataset
722 The data set.
723 vintage
724 One or more Vintages to explore.
725 group_name
726 The group if we should explore only a single group. If `None` all groups
727 will be explored.
728 name
729 The name of one of more variables to explore. If `None` all variables are considered.
730 Normally at most one of `name` and `re` will be used.
731 pattern
732 A regular expression to match against the name and description of a variable. This
733 is used to filter down results. Normally at most one of `name` and `re` will be used.
734 case:
735 If `patters` is not `None` then indicates whether the regular expression match is
736 case sensitive. Does not affect the `name` match.
737 skip_annotations
738 If `True` try to filter out variables that are
739 annotations rather than actual values, by skipping
740 those with labels that begin with "Annotation" or
741 "Margin of Error".
742 skip_subgroup_variables
743 If this is `True`, then we will ignore variables from alphabetical
744 subgroups. These are relatively common in the ACS, where there are
745 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
746 so on. The underlying census API sometimes reports variables like
747 `B01001A_001E` from these as members of `B01001` and other times as
748 members of `B01001A`. Setting this `True`, which is the default,
749 does not report `B01001A_001E` when the group name `name='B01001'`.
751 Returns
752 -------
753 A data frame of matching variables.
754 """
755 if isinstance(vintage, int):
756 vintage = [vintage]
758 def _all_variables_eat_404(year: int):
759 """
760 Skip bad year and return no results.
762 We assume it is a bad year if we get a 404.
763 """
764 try:
765 return self.all_variables(
766 dataset,
767 year,
768 group_name,
769 skip_annotations=skip_annotations,
770 skip_subgroup_variables=skip_subgroup_variables,
771 )
772 except CensusApiException as e:
773 if "404" in str(e):
774 return pd.DataFrame()
775 else:
776 raise e
778 df_all_variables = pd.concat(
779 (_all_variables_eat_404(year) for year in vintage), ignore_index=True
780 )
782 # If we were given names to match on, then match on them.
783 if name is not None:
784 if isinstance(name, str):
785 name = [name]
787 df_name_matches = pd.concat(
788 [
789 df_all_variables[df_all_variables["VARIABLE"] == var_name]
790 for var_name in name
791 ],
792 axis="rows",
793 )
794 else:
795 df_name_matches = df_all_variables
797 if pattern is not None:
798 pattern = self._compile_pattern(pattern, case)
800 df_matches = df_name_matches[
801 (
802 df_name_matches["VARIABLE"].str.contains(pattern)
803 | df_name_matches["LABEL"].str.contains(pattern)
804 )
805 ]
806 else:
807 df_matches = df_name_matches
809 return pd.DataFrame(df_matches).reset_index(drop=True)
811 def group_tree(
812 self,
813 dataset: str,
814 year: int,
815 group_name: Optional[str],
816 *,
817 skip_annotations: bool = True,
818 ) -> "VariableCache.GroupTreeNode":
819 """
820 Construct a tree that embodies the parent/child relationships of all the variables in a group.
822 Parameters
823 ----------
824 dataset
825 The data set.
826 year
827 The year.
828 group_name
829 The group.
830 skip_annotations
831 If `True`, skip variables that are annotations of others, like
832 margin of error.
834 Returns
835 -------
836 A tree that can be printed or walked.
837 """
838 group = self.get_group(dataset, year, group_name)
840 root = VariableCache.GroupTreeNode()
842 for variable_name, details in group.items():
843 path = details["label"].split("!!")
845 if skip_annotations and (
846 path[0].startswith("Annotation")
847 or path[0].startswith("Margin of Error")
848 or path[0].startswith("Statistical Significance")
849 ):
850 continue
852 node = root
854 # Construct a nested path of nodes down to the
855 # leaf.
856 for component in path:
857 child = node.get(component, None)
858 if child is None:
859 child = VariableCache.GroupTreeNode()
860 node.add_child(component, child)
861 node = child
863 # Put the variable name at the lead.
864 node.name = variable_name
866 return root
868 def group_leaves(
869 self, dataset: str, year: int, name: str, *, skip_annotations: bool = True
870 ) -> List[str]:
871 """
872 Find the leaves of a given group.
874 Parameters
875 ----------
876 dataset
877 The census dataset.
878 year
879 The year
880 name
881 The name of the group.
882 skip_annotations
883 If `True` try to filter out variables that are
884 annotations rather than actual values, by skipping
885 those with labels that begin with "Annotation" or
886 "Margin of Error".
888 Returns
889 -------
890 A list of the variables in the group that are leaves,
891 i.e. they are not aggregates of other variables. For example,
892 in the group `B03002` in the `acs/acs5` dataset in the
893 year `2020`, the variable `B03002_003E` is a leaf, because
894 it represents
895 "Estimate!!Total:!!Not Hispanic or Latino:!!White alone",
896 whereas B03002_002E is not a leaf because it represents
897 "Estimate!!Total:!!Not Hispanic or Latino:", which is a total
898 that includes B03002_003E as well as others like "B03002_004E",
899 "B03002_005E" and more.
901 The typical reason we want leaves is because that gives us a set
902 of variables representing counts that do not overlap and add up
903 to the total. We can use these directly in diversity and integration
904 calculations using the `divintseg` package.
905 """
906 tree = self.group_tree(dataset, year, name)
908 leaves = tree.leaf_variables()
910 if skip_annotations:
911 group = self.get_group(dataset, year, name)
912 leaves = (
913 leaf
914 for leaf in leaves
915 if (not group[leaf]["label"].startswith("Annotation"))
916 and (not group[leaf]["label"].startswith("Margin of Error"))
917 )
919 return sorted(leaves)
921 def group_variables(
922 self,
923 dataset: str,
924 year: int,
925 group_name: str,
926 *,
927 skip_annotations: bool = True,
928 skip_subgroup_variables: bool = True,
929 ) -> List[str]:
930 """
931 Find the variables of a given group.
933 Parameters
934 ----------
935 dataset
936 The census dataset.
937 year
938 The year
939 group_name
940 The name of the group.
941 skip_annotations
942 If `True` try to filter out variables that are
943 annotations rather than actual values, by skipping
944 those with labels that begin with "Annotation" or
945 "Margin of Error".
946 skip_subgroup_variables
947 If this is `True`, then we will ignore variables from alphabetical
948 subgroups. These are relatively common in the ACS, where there are
949 groups like `B01001` that have subgroups `B01001A`, `B01001B` and
950 so on. The underlying census API sometimes reports variables like
951 `B01001A_001E` from these as members of `B01001` and other times as
952 members of `B01001A`. Setting this `True`, which is the default,
953 does not report `B01001A_001E` when the group name `name='B01001'`.
955 Returns
956 -------
957 A list of the variables in the group.
958 """
959 tree = self.get_group(
960 dataset, year, group_name, skip_subgroup_variables=skip_subgroup_variables
961 )
963 if skip_annotations:
964 group_variables = [
965 k
966 for k, v in tree.items()
967 if (not v["label"].startswith("Annotation"))
968 and (not v["label"].startswith("Margin of Error"))
969 ]
970 else:
971 group_variables = list(tree.keys())
973 return sorted(group_variables)
975 def __contains__(self, item: Tuple[str, int, str]) -> bool:
976 """Magic method behind the `in` operator."""
977 source, year, name = item
979 return name in self._variable_cache[source][year]
981 def __getitem__(self, item: Tuple[str, int, str]):
982 """Magic method behind the `[]` operator."""
983 return self.get(*item)
985 def __len__(self):
986 """Return he number of elements in the cache."""
987 return sum(
988 len(names)
989 for years in self._variable_cache.values()
990 for names in years.values()
991 )
993 def keys(self) -> Iterable[Tuple[str, int, str]]:
994 """Keys, i.e. the names of variables, in the cache."""
995 for key, _ in self.items():
996 yield key
998 def __iter__(self) -> Iterable[Tuple[str, int, str]]:
999 """Return an iterator over the keys."""
1000 return self.keys()
1002 def values(self) -> Iterable[dict]:
1003 """Values, i.e. the descriptions of variables, in the cache."""
1004 for _, value in self.items():
1005 yield value
1007 def items(self) -> Iterable[Tuple[Tuple[str, int, str], dict]]:
1008 """Items in the mapping from variable name to descpription."""
1009 for source, values_for_source in self._variable_cache.items():
1010 for year, values_for_year in values_for_source.items():
1011 for name, value in values_for_year.items():
1012 yield (source, year, name), value
1014 def invalidate(self, dataset: str, year: int, name: str):
1015 """Remove an item from the cache."""
1016 if self._variable_cache[dataset][year].pop(name, None):
1017 if len(self._variable_cache[dataset][year]) == 0:
1018 self._variable_cache[dataset].pop(year)
1019 if len(self._variable_cache[dataset]) == 0:
1020 self._variable_cache.pop(dataset)
1022 def clear(self):
1023 """
1024 Clear the entire cache.
1026 This just means that further calls to :py:meth:`~get` will
1027 have to make a call to the source behind the cache.
1028 """
1029 self._variable_cache = defaultdict(lambda: defaultdict(dict))