##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
## Lists the files in `bulk_directory`, derives a date for each one from its
## file name, and calls `write_cr()` once per selected date.
##
## Arguments:
##   bulk_directory  Directory whose files are listed. File names are expected
##                   to contain a "YYYY-MM-DD" date.
##   skip_parsed     If TRUE, dates that already appear in file names under the
##                   matching "txt" directory (the same path with "/htm"
##                   replaced by "/txt") are not passed to `write_cr()`.
##   dates           Either "all" (every date found in `bulk_directory` with a
##                   year before 2021) or a vector of dates to process.
##
## Returns whatever `walk()` returns (its input, invisibly); the function is
## called for the side effects of `write_cr()`.
function (bulk_directory = here::here("data", "htm"), skip_parsed = TRUE,
          dates = "all")
{
  date_pattern <- "[0-9]{4}-[0-9]{2}-[0-9]{2}"

  # One row per input file, with the year and date embedded in its name.
  cr_file <- list.files(bulk_directory)
  cr <- tibble(
    file = cr_file,
    year = as.numeric(str_extract(cr_file, "[0-9]{4}")),
    date = as.Date(str_extract(cr_file, date_pattern))
  )

  # Newest first. Sorting by `rev(date)` only reverses the order when all
  # dates are distinct; with several files per date it scrambles the rows.
  cr <- arrange(cr, desc(date))

  # `identical()` keeps the condition scalar, so a vector of several dates is
  # accepted instead of failing with "the condition has length > 1".
  if (identical(as.character(dates), "all")) {
    dates <- cr %>%
      filter(year < 2021) %>%
      pull(date) %>%
      unique()
  }

  # as.logical() keeps accepting values such as 1 or "TRUE" for the flag.
  if (isTRUE(as.logical(skip_parsed))) {
    parsed_files <- list.files(
      str_replace(bulk_directory, "/htm", "/txt"),
      recursive = TRUE
    )
    cr_parsed <- as.Date(unique(str_extract(parsed_files, date_pattern)))

    cr <- filter(cr, !date %in% cr_parsed)

    # Drop already-parsed dates from the vector that is actually iterated
    # over; filtering `cr` alone never changed what reached `write_cr()`.
    # Compared as "YYYY-MM-DD" text so Date and character input both work.
    dates <- dates[!as.character(dates) %in% as.character(cr_parsed)]
  }

  cr <- cr %>%
    mutate(
      # Maps each pair of years to one number: 2001-2002 -> 107,
      # 2003-2004 -> 108, and so on.
      congress = round((year - 2001.1) / 2) + 107,
      # The character following "Pg" in the file name selects the section.
      chamber = str_extract(file, "Pg.") %>%
        str_remove("Pg") %>%
        str_replace("E", "Extensions of Remarks") %>%
        str_replace("H", "House") %>%
        str_replace("S", "Senate"),
      url_txt = str_c(
        "https://www.congress.gov/", congress, "/crec/",
        date %>% str_replace_all("-", "/"),
        "/modified/", file
      )
    )

  # NOTE(review): `cr` is local to this function and is not passed to
  # `write_cr()`; confirm how `write_cr()` obtains the file table it needs.
  walk(.x = unique(dates), .f = write_cr)
}
#> function (bulk_directory = here::here("data", "htm"), skip_parsed = T,
#> dates = "all")
#> {
#> cr_file <- list.files(bulk_directory)
#> cr <- tibble(file = cr_file, year = str_extract(cr_file,
#> "[0-9]{4}") %>% as.numeric(), date = str_extract(cr_file,
#> "[0-9]{4}-[0-9]{2}-[0-9]{2}") %>% as.Date())
#> cr %<>% arrange(date) %>% arrange(rev(date))
#> if (as.character(dates) == "all") {
#> dates <- cr %>% filter(year < 2021) %>% pull(date) %>%
#> unique()
#> }
#> if (skip_parsed == T) {
#> cr_parsed <- list.files(bulk_directory %>% str_replace("/htm",
#> "/txt"), recursive = T)
#> length(cr_parsed)
#> cr_parsed %<>% str_extract("[0-9]{4}-[0-9]{2}-[0-9]{2}") %<>%
#> unique() %>% as.Date()
#> length(cr_parsed)
#> cr %<>% filter(!date %in% cr_parsed)
#> dim(cr)
#> }
#> cr %<>% mutate(congress = as.numeric(round((year - 2001.1)/2)) +
#> 107)
#> cr %<>% mutate(chamber = str_extract(file, "Pg.") %>% str_remove("Pg") %>%
#> str_replace("E", "Extensions of Remarks") %>% str_replace("H",
#> "House") %>% str_replace("S", "Senate"))
#> cr %<>% mutate(url_txt = str_c("https://www.congress.gov/",
#> congress, "/crec/", date %>% str_replace_all("-", "/"),
#> "/modified/", file))
#> walk(.x = unique(dates), .f = write_cr)
#> }
#> <environment: 0x1388e9f20>