Skip to content

Commit 1832e35

Browse files
committed
add catalogue filtering
1 parent 471517e commit 1832e35

File tree

7 files changed

+1796
-321
lines changed

7 files changed

+1796
-321
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ RdMacros:
3232
Encoding: UTF-8
3333
Language: en
3434
Roxygen: list(markdown = TRUE)
35-
RoxygenNote: 7.3.1
35+
RoxygenNote: 7.3.2

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export("%>%")
4+
export(czso_filter_catalogue)
45
export(czso_get_catalogue)
56
export(czso_get_codelist)
67
export(czso_get_dataset_doc)

NEWS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# czso (development version)
22

3+
* add new function `czso_filter_catalogue()`, which provides an ergonomic search of the catalogue: it searches the title, description and keywords and returns only the entries that match all of the search terms.
4+
* related to above, `czso_get_catalogue()` has a new `search_terms` parameter, which filters the catalogue inline.
5+
36
# czso 0.4.0
47

58
* move to new CZSO API

R/core.R

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
#' Pass the string in the `dataset_id` column to `czso_get_table()`. `dataset_iri`
77
#' is the unique identifier of the dataset in the national catalogue and also the URL
88
#' containing all metadata for the dataset.
9-
#'
9+
#' @param search_terms a regex pattern, or a vector of regex patterns, to filter the catalogue by.
10+
#' A case-insensitive filter is performed on the title, description and keywords.
11+
#' The search returns only catalogue entries where all the patterns are matched anywhere within the title, description or keywords.
1012
#' @return a data frame with details on all CZSO datasets available in the Czech National Open Data Catalogue.
1113
#' The columns are fairly well described by their names, except:
1214
#'
@@ -23,13 +25,14 @@
2325
#' @examples
2426
#' \donttest{
2527
#' czso_get_catalogue()
28+
#' czso_get_catalogue(search_terms = c("kraj", "me?zd"))
2629
#' }
27-
czso_get_catalogue <- function() {
30+
czso_get_catalogue <- function(search_terms = NULL) {
2831
url <- "https://vdb.czso.cz/pll/eweb/lkod_ld.seznam"
2932

3033
if(is_above_bigsur()) stop_on_openssl()
3134

32-
suppressWarnings(readr::read_csv(url,
35+
ctlg <- suppressWarnings(readr::read_csv(url,
3336
col_types = readr::cols(
3437
dataset_iri = readr::col_character(),
3538
dataset_id = readr::col_character(),
@@ -46,6 +49,48 @@ czso_get_catalogue <- function() {
4649
))) %>%
4750
dplyr::mutate(periodicity = dplyr::recode(.data$periodicity, nikdy = "NEVER"))
4851

52+
if(!is.null(search_terms)) {
53+
czso_filter_catalogue(ctlg, search_terms)
54+
} else {
55+
ctlg
56+
}
57+
58+
}
59+
60+
#' Filter the catalogue using a set of search terms
#'
#' Keeps the catalogue entries whose title, description or keywords match
#' every one of the supplied patterns. Each pattern may be matched in a
#' different field; the dataset ID itself is not searched.
#'
#' @param catalogue a catalogue as returned by `czso_get_catalogue()`. It must
#'   contain the columns `title`, `description` and `keywords_all`.
#' @param search_terms A regex pattern (incl. plain text), or a vector of regex patterns, to filter the catalogue by.
#' A case-insensitive filter is performed on the title, description and keywords.
#' The search returns only catalogue entries where all the patterns are matched anywhere within the title, description or keywords.
#' If no patterns are supplied (`NULL` or a zero-length vector), the catalogue is returned unfiltered.
#'
#' @return A tibble with the filtered catalogue (same columns and row order as
#'   the input).
#' @export
#'
#' @examples
#' \donttest{
#' ctlg <- czso_get_catalogue()
#' czso_filter_catalogue(ctlg, search_terms = c("kraj", "me?zd"))
#' czso_filter_catalogue(ctlg, search_terms = c("úmrt", "orp"))
#' czso_filter_catalogue(ctlg, search_terms = c("kraj", "vazba", "orp"))
#' czso_filter_catalogue(ctlg, search_terms = c("ISCO", "číselník"))
#' czso_filter_catalogue(ctlg, search_terms = c("zaměstnání", "číselník"))
#' }
czso_filter_catalogue <- function(catalogue, search_terms) {
  # Only these text fields are searched; `dataset_id` is deliberately excluded
  # so that a pattern cannot match on the identifier by accident.
  search_cols <- c("title", "description", "keywords_all")

  missing_cols <- setdiff(search_cols, names(catalogue))
  if (length(missing_cols) > 0) {
    stop("`catalogue` is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  if (anyNA(search_terms)) {
    stop("`search_terms` must not contain missing values.", call. = FALSE)
  }

  n_rows <- nrow(catalogue)

  # A row is kept only if *every* pattern matches in *at least one* of the
  # searched columns. Start from "keep everything" and narrow down per pattern.
  keep <- rep(TRUE, n_rows)
  for (pattern in search_terms) {
    matched <- rep(FALSE, n_rows)
    for (col in search_cols) {
      # grepl() returns FALSE for NA text, so missing fields never match.
      matched <- matched | grepl(pattern, catalogue[[col]], ignore.case = TRUE)
    }
    keep <- keep & matched
  }

  # Subset by row position rather than by ID so duplicated or missing
  # `dataset_id` values cannot pull in rows that did not match.
  catalogue[keep, , drop = FALSE]
}
5095

5196

man/czso_filter_catalogue.Rd

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/czso_get_catalogue.Rd

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)