This example uses the tidyverse
packages stringr
, purrr
, as well as here
and magrittr
.
Say we have a bunch of pdfs that look like this. This one has over 200 pages, so this would be a lot to re-arange by hand.
Let us start by cleaning one file
text <- pdf_text(here("data/Congressional logs Dec 2017-May 2019.pdf"))
text[1:2]
## [1] " U.S. Department of Energy\n The Electronic Document Online Correspondence\n and Collaboration System (eDOCS)\n Folder List Report\n 1. Title: Letter to Secretary Perry (via email) from Senator Sheldon Whitehouse\n Subject: DOE proposal to FERC\n Control: EXEC-2017-009153\n Assigned To: CI / Ted Garrish Created Date: 12/08/2017\n Signature Level: Rick Perry Correspondence Date: 12/05/2017\n Addressee Office: Rick Perry Received Date: 12/08/2017\n Action Requested: Prepare Response Completed Date: 12/13/2017\n Priority: CAP2 Due Date: 12/22/2017\n Point of Contact: Status: completed\n Program Contact: Source: CC\nReport run at: 1/10/2018 3:28:07PM Page: 1 of 9\n"
## [2] " U.S. Department of Energy\n The Electronic Document Online Correspondence\n and Collaboration System (eDOCS)\n Folder List Report\n 1. Title: Letter to Secretary Rick Perry from Representatives Robert Aderholt, Walter Jones, and Jeff Duncan\n Subject: Wants the Secretary to include the development of controlled Low Energy Nuclear Reactors (LENR) as part of\n the DOE funded renewable energies list\n Control: EXEC-2017-009163\n Assigned To: EE / Daniel Simmons Created Date: 12/11/2017\n Signature Level: Rick Perry Correspondence Date: 12/07/2017\n Addressee Office: Rick Perry Received Date: 12/11/2017\n Action Requested: Prepare Response Completed Date: [no date provided]\n Priority: CAP2 Due Date: 01/02/2018\n Point of Contact: Johnsen, Steven (MA) Status: open\n Program Contact: Source: CC\nReport run at: 1/10/2018 3:28:07PM Page: 2 of 9\n"
# replace all line breaks ("\n") with a space
text %<>% str_replace_all("\n", " ")
# replace one or more spaces with a single space using the regular expression "+" for a pattern repeating one or more times
text %<>% str_replace_all(" +", " ")
# remove white space from the beginning and end of the string
text %<>% str_trim()
# str_squish is equivelent to both str_trim()/trimws() and str_replace_all(" +", " ")
text %<>% str_squish()
text[1:2]
## [1] "U.S. Department of Energy The Electronic Document Online Correspondence and Collaboration System (eDOCS) Folder List Report 1. Title: Letter to Secretary Perry (via email) from Senator Sheldon Whitehouse Subject: DOE proposal to FERC Control: EXEC-2017-009153 Assigned To: CI / Ted Garrish Created Date: 12/08/2017 Signature Level: Rick Perry Correspondence Date: 12/05/2017 Addressee Office: Rick Perry Received Date: 12/08/2017 Action Requested: Prepare Response Completed Date: 12/13/2017 Priority: CAP2 Due Date: 12/22/2017 Point of Contact: Status: completed Program Contact: Source: CC Report run at: 1/10/2018 3:28:07PM Page: 1 of 9"
## [2] "U.S. Department of Energy The Electronic Document Online Correspondence and Collaboration System (eDOCS) Folder List Report 1. Title: Letter to Secretary Rick Perry from Representatives Robert Aderholt, Walter Jones, and Jeff Duncan Subject: Wants the Secretary to include the development of controlled Low Energy Nuclear Reactors (LENR) as part of the DOE funded renewable energies list Control: EXEC-2017-009163 Assigned To: EE / Daniel Simmons Created Date: 12/11/2017 Signature Level: Rick Perry Correspondence Date: 12/07/2017 Addressee Office: Rick Perry Received Date: 12/11/2017 Action Requested: Prepare Response Completed Date: [no date provided] Priority: CAP2 Due Date: 01/02/2018 Point of Contact: Johnsen, Steven (MA) Status: open Program Contact: Source: CC Report run at: 1/10/2018 3:28:07PM Page: 2 of 9"
# make it a tibble (a tidy data frame)
d <- tibble(text = text)
d[1:2, ]
## # A tibble: 2 x 1
## text
## <chr>
## 1 U.S. Department of Energy The Electronic Document Online Correspondence …
## 2 U.S. Department of Energy The Electronic Document Online Correspondence …
# split into columns by a Regex pattern "sep = "[regex]"
d %<>%
separate(text, into = c("header", "text"), sep = "Title:") %>%
# drop header
select(-header)
d
## # A tibble: 207 x 1
## text
## <chr>
## 1 " Letter to Secretary Perry (via email) from Senator Sheldon Whitehouse…
## 2 " Letter to Secretary Rick Perry from Representatives Robert Aderholt, …
## 3 " Letter to Secretary Rick Perry from Senator John Hoeven Subject: Stro…
## 4 " Letter to Secretary Perry from Reps. Trey Gowdy, Chairman, Committee …
## 5 " Letter to Secretary Rick Perry from Senator Michael F. Bennet, Senato…
## 6 " Joint letter to Secretary Rick Perry and Mick Mulvaney, Director, Off…
## 7 " Letter to Secretary Rick Perry from Representative Fred Upton, Chairm…
## 8 " Joint letter to Secretary Rick Perry and Mick Mulvaney, Director, Off…
## 9 " Invitation to Secretary Rick Perry from Senator Michael Enzi and Repr…
## 10 " Letter to Secretary Rick Perry from Senator Tammy Baldwin Subject: Wr…
## # … with 197 more rows
# split out additional columns
d %<>%
tidyr::separate(text,
into = c("Title", "Subject", "Control", "Assigned","Created", "Signature", "DATE", "Addressee", "Received", "Action", "Completed", "Priority", "Due", "Point of Contact", "Status"),
sep = c("Subject:|Control:|Assigned To:|Created Date:|Signature Level:|Correspondence Date:|Addressee Office:|Received Date:|Action Requested:|Completed Date:|Priority:|Due Date:|Point of Contact:|Status:"), # RegEx pattern of separators
extra = "merge") # past any extra text at the end rather than dropping it
d %>% select(Title, Subject, DATE)
## # A tibble: 207 x 3
## Title Subject DATE
## <chr> <chr> <chr>
## 1 " Letter to Secretary Perry (v… " DOE proposal to FERC " " 12/05…
## 2 " Letter to Secretary Rick Per… " Wants the Secretary to inclu… " 12/07…
## 3 " Letter to Secretary Rick Per… " Strongly urges DOE to consid… " 12/15…
## 4 " Letter to Secretary Perry fr… " Submits questions regarding … " 12/08…
## 5 " Letter to Secretary Rick Per… " Seeks DOE assistance in esta… " 12/12…
## 6 " Joint letter to Secretary Ri… " Writes to request DOE's cont… " 12/07…
## 7 " Letter to Secretary Rick Per… " Submits questions regarding … " 12/04…
## 8 " Joint letter to Secretary Ri… " Writes to request DOE's cont… " 12/07…
## 9 " Invitation to Secretary Rick… " Invites Secretary Perry to s… " 12/04…
## 10 " Letter to Secretary Rick Per… " Writes in support of the Wis… " 01/05…
## # … with 197 more rows
Trim white space at the beginning and end of each cell with str_squish
. Apply to all columns with mutate_all()
.
d %<>% mutate_all(str_squish)
d %>% select(Title, Subject, DATE)
## # A tibble: 207 x 3
## Title Subject DATE
## <chr> <chr> <chr>
## 1 Letter to Secretary Perry (via… DOE proposal to FERC 12/05…
## 2 Letter to Secretary Rick Perry… Wants the Secretary to include t… 12/07…
## 3 Letter to Secretary Rick Perry… "Strongly urges DOE to consider … 12/15…
## 4 Letter to Secretary Perry from… Submits questions regarding guid… 12/08…
## 5 Letter to Secretary Rick Perry… Seeks DOE assistance in establis… 12/12…
## 6 Joint letter to Secretary Rick… Writes to request DOE's continue… 12/07…
## 7 Letter to Secretary Rick Perry… Submits questions regarding Part… 12/04…
## 8 Joint letter to Secretary Rick… Writes to request DOE's continue… 12/07…
## 9 Invitation to Secretary Rick P… Invites Secretary Perry to speak… 12/04…
## 10 Letter to Secretary Rick Perry… Writes in support of the Wiscons… 01/05…
## # … with 197 more rows
Save as .csv
d %<>% write_csv(path = "DOE Dec 2017-May 2019.csv")
Turn the code we used to clean one file above in a function:
# the new function, pdf_to_dataframe(), takes one argument, a file name
pdf_to_dataframe <- function(file){
text <- pdf_text(here(file)) # the only change is that I replaced the file string with a *variable* called file.
# replace all line breaks ("\n") with a space
text %<>% str_replace_all("\n", " ")
# make it a tibble (a tidy data frame)
d <- tibble(text = text)
# split into columns by a Regex pattern "sep = "[regex]"
d %<>%
separate(text, into = c("header", "text"), sep = "Title:") %>%
# drop header
select(-header)
# split out additional columns
d %<>%
tidyr::separate(text,
into = c("Title", "Subject", "Control", "Assigned","Created", "Signature", "DATE", "Addressee", "Received", "Action", "Completed", "Priority", "Due", "Point of Contact", "Status"),
sep = c("Subject:|Control:|Assigned To:|Created Date:|Signature Level:|Correspondence Date:|Addressee Office:|Received Date:|Action Requested:|Completed Date:|Priority:|Due Date:|Point of Contact:|Status:"), # RegEx pattern of separators
extra = "merge") # past any extra text at the end rather than dropping it
d %<>% mutate_all(str_squish)
return(d) # return the output of the function
}
Let us test out our new function
pdf_to_dataframe("data/Congressional logs Dec 2017-May 2019.pdf")
## # A tibble: 207 x 15
## Title Subject Control Assigned Created Signature DATE Addressee
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Lett… DOE pr… EXEC-2… CI / Te… 12/08/… Rick Per… 12/0… Rick Per…
## 2 Lett… Wants … EXEC-2… EE / Da… 12/11/… Rick Per… 12/0… Rick Per…
## 3 Lett… "Stron… EXEC-2… FE / St… 12/18/… FE 12/1… Rick Per…
## 4 Lett… Submit… EXEC-2… GC / Jo… 12/11/… GC 12/0… Rick Per…
## 5 Lett… Seeks … EXEC-2… LM / Ca… 12/13/… Rick Per… 12/1… Rick Per…
## 6 Join… Writes… EXEC-2… NE / Ed… 12/08/… Rick Per… 12/0… Rick Per…
## 7 Lett… Submit… EXEC-2… NNSA / … 12/04/… Rick Per… 12/0… Rick Per…
## 8 Join… Writes… EXEC-2… SC / St… 12/08/… Rick Per… 12/0… Rick Per…
## 9 Invi… Invite… EXEC-2… SL 12/19/… N/A 12/0… Rick Per…
## 10 Lett… Writes… EXEC-2… EE / Da… 01/05/… EE 01/0… Rick Per…
## # … with 197 more rows, and 7 more variables: Received <chr>,
## # Action <chr>, Completed <chr>, Priority <chr>, Due <chr>, `Point of
## # Contact` <chr>, Status <chr>
map_dfr
from purrr
files <- c("data/Congressional logs Dec 2017-May 2019.pdf")
# apply (map) our function onto each of our files and make a data frame (df) by combining them by row (r)
d <- map_dfr(files, pdf_to_dataframe)
d %<>% write_csv(path = "DOE_combined.csv")