# Read one pipe-delimited speaker file from the hein-bound folder.
#
# file: name of the file to read. It is echoed with message(), and the part of
#   the name before its first underscore is stored in a `congress` column.
# Returns the de-duplicated rows of the file without the speech_id column.
get_speakers <- function(file) {
  message(file)
  # build the path under "hein-bound/", dropping the first "/cr" it contains
  path <- str_c(str_remove(here("hein-bound/", file), "/cr"))
  raw <- read_delim(path, delim = "|")
  # drop speech data for now
  without_speech <- select(raw, -speech_id)
  # extract congress from file name
  with_congress <- mutate(without_speech, congress = str_remove(file, "_.*"))
  distinct(with_congress)
}
# list the contents of the hein-bound folder as a data frame with a `file` column
heinbound <- enframe(
  list.files(str_remove(here("hein-bound"), "/cr")),
  value = "file"
)
# keep only the SpeakerMap files whose names start with "1"
# FIXME subsetting to >99th congress
speakermaps <- filter(
  heinbound,
  str_detect(file, "SpeakerMap") & str_detect(file, "^1")
)
# speakers dataframe: read every speakermap file and stack the rows
speakers <- map_dfr(speakermaps$file, get_speakers)
# match to voteview format
speakers <- speakers %>%
  mutate(
    last_name = lastname,
    first_name = str_to_sentence(firstname),
    # strip one leading zero, then convert to a number
    congress = as.numeric(str_remove(congress, "^0")),
    # expand a trailing "S" / "H" to the full chamber name
    chamber = str_replace(str_replace(chamber, "S$", "Senate"), "H$", "House"),
    district_code = ifelse(chamber == "Senate", 0, district),
    # clean up some 0s that are actually missing data
    district_code = as.numeric(
      ifelse(chamber == "House" & district_code == "0", NA, district_code)
    )
  ) %>%
  rename(state_abbrev = state) %>%
  # FIXME focusing on the 107-111th for now
  filter(congress > 106)
# Devin's members data (expanded from voteview)
# load() restores the objects saved in data/members.Rdata; the code below expects one of them to be `members`
load(here("data", "members.Rdata"))
# members %>% select(names(members)[names(members) %in% names(speakers)])
# Make mixed case names, like "McConnell" upper case (to go the other way, we would need to fix them one by one)
members %<>%
# upper-case last_name and add `party` as the first character of party_name
mutate(last_name = str_to_upper(last_name),
party = str_sub(party_name, 1, 1))A few last names also differ, e.g. Mary Bono Mack also fails to match because she is just Mary Bono in Voteview.
# Compare first names between the two sources for speakers that fail an exact match.
# Steps: drop district_code, keep speakers with no match in members, drop first_name,
# then re-join to members so the members first_name sits beside the speakermap firstname.
# Neither join passes `by`, so both match on every column name the two tables share.
# kablebox() is not defined in this section; presumably it renders the table -- confirm.
speakers %>%
select( -district_code) %>%
anti_join(members) %>%
select(-first_name) %>%
left_join(members) %>%
select(voteview_first_name = first_name,
speakermap_firstname = firstname,
lastname, nonvoting) %>%
distinct() %>%
arrange(lastname) %>% kablebox()| voteview_first_name | speakermap_firstname | lastname | nonvoting |
|---|---|---|---|
| NA | ANIBAL | ACEVEDO-VILA | nonvoting |
| W | W. | AKIN | voting |
| NA | RODNEY | ALEXANDER | voting |
| A | WAYNE | ALLARD | voting |
| Kenneth | KEN | BENTSEN | voting |
| Jesse | JEFF | BINGAMAN | voting |
| Robert | ROB | BISHOP | voting |
| Josiah | JO | BONNER | voting |
| NA | MARY | BONO MACK | voting |
| NA | MADELEINE | BORDALLO | nonvoting |
| Daniel | DAN | BOREN | voting |
| F | ALLEN | BOYD | voting |
| Sam | SAMUEL | BROWNBACK | voting |
| Vernon | VERN | BUCHANAN | voting |
| James | JIM | BUNNING | voting |
| Danny | DAN | BURTON | voting |
| Herbert | H. | CALLAHAN | voting |
| David | DAVE | CAMP | voting |
| Chris | CHRISTOPHER | CARNEY | voting |
| A | BEN | CHANDLER | voting |
| NA | DONNA | CHRISTENSEN | nonvoting |
| William | WM. | CLAY | voting |
| Joseph | J. | CLELAND | voting |
| William | THAD | COCHRAN | voting |
| Stephen | STEVE | COHEN | voting |
| Kenneth | K. | CONAWAY | voting |
| Christopher | CHRIS | COONS | voting |
| James | JIM | COOPER | voting |
| Robert | BOB | CORKER | voting |
| Charles | C. | COX | voting |
| NA | ANDER | CRENSHAW | voting |
| Thomas | JO ANN | DAVIS | voting |
| Jo | JO ANN | DAVIS | voting |
| Jim | JAMES | DAVIS | voting |
| Geoffrey | GEOFF | DAVIS | voting |
| John | NATHAN | DEAL | voting |
| Bill | WILLIAM | DELAHUNT | voting |
| James | JIM | DEMINT | voting |
| Jo | JO ANN | EMERSON | voting |
| Bobby | BOB | ETHERIDGE | voting |
| Robert | TERRY | EVERETT | voting |
| NA | ENI | FALEOMAVAEGA | nonvoting |
| James | J. | FORBES | voting |
| NA | LUIS | FORTUNO | nonvoting |
| Al | ALAN | FRANKEN | voting |
| Robert | BOB | GOODLATTE | voting |
| Daniel | BOB | GRAHAM | voting |
| William | PHIL | GRAMM | voting |
| Samuel | SAM | GRAVES | voting |
| Raymond | GENE | GREEN | voting |
| Al | GENE | GREEN | voting |
| John | J. | HASTERT | voting |
| Richard | DOC | HASTINGS | voting |
| Robert | ROBIN | HAYES | voting |
| Thomas | TIM | HOLDEN | voting |
| Mike | MICHAEL | HONDA | voting |
| John | STEPHEN | HORN | voting |
| Timothy | TIM | HUTCHINSON | voting |
| Kathryn | KAY | HUTCHISON | voting |
| Robert | BOB | INGLIS | voting |
| Johnny | JOHN | ISAKSON | voting |
| Timothy | TIM | JOHNSON | voting |
| Hank | HENRY | JOHNSON | voting |
| Steven | STEVE | KAGEN | voting |
| Marcia | MARCY | KAPTUR | voting |
| Richard | RIC | KELLER | voting |
| Mary | MARY JO | KILROY | voting |
| Thomas | TOM | LANTOS | voting |
| Richard | RICK | LARSEN | voting |
| Thomas | TOM | LATHAM | voting |
| Charles | JERRY | LEWIS | voting |
| NA | JOSEPH | LIEBERMAN | voting |
| Dave | DAVID | LOEBSACK | voting |
| Chester | TRENT | LOTT | voting |
| Ken | KENNETH | LUCAS | voting |
| Jim | JAMES | MARSHALL | voting |
| Melquiades | MEL | MARTINEZ | voting |
| James | JIM | MATHESON | voting |
| Addison | MITCH | MCCONNELL | voting |
| James | JIM | MCDERMOTT | voting |
| Daniel | JEFF | MILLER | voting |
| Jefferson | JEFF | MILLER | voting |
| Daniel | DAN | MILLER | voting |
| Jefferson | DAN | MILLER | voting |
| Brad | R. | MILLER | voting |
| Walt | WALTER | MINNICK | voting |
| Gwendolynne | GWEN | MOORE | voting |
| Timothy | TIM | MURPHY | voting |
| Clarence | BILL | NELSON | voting |
| Earl | BEN | NELSON | voting |
| Donald | DON | NICKLES | voting |
| NA | ELEANOR | NORTON | nonvoting |
| C | C.L. | OTTER | voting |
| William | BILL | PASCRELL | voting |
| Tom | THOMAS | PERRIELLO | voting |
| NA | PEDRO | PIERLUISI | nonvoting |
| John | JACK | QUINN | voting |
| Thomas | TOM | REED | voting |
| Denny | DENNIS | REHBERG | voting |
| Robert | BOB | RILEY | voting |
| Charles | PAT | ROBERTS | voting |
| Michael | MIKE | ROSS | voting |
| Margaret | MARGE | ROUKEMA | voting |
| C | C. | RUPPERSBERGER | voting |
| Timothy | TIM | RYAN | voting |
| NA | GREGORIO | SABLAN | nonvoting |
| Kenneth | KEN | SALAZAR | voting |
| William | BILL | SALI | voting |
| Hugh | H. | SAXTON | voting |
| Charles | JOE | SCARBOROUGH | voting |
| Robert | BOB | SCHAFFER | voting |
| Frank | F. | SENSENBRENNER | voting |
| Eugene | E. | SHAW | voting |
| Don | DONALD | SHERWOOD | voting |
| Clifford | RONNIE | SHOWS | voting |
| William | BILL | SHUSTER | voting |
| Isaac | IKE | SKELTON | voting |
| Zack | ZACHARY | SPACE | voting |
| NA | ARLEN | SPECTER | voting |
| Karen | JACKIE | SPEIER | voting |
| Deborah | DEBBIE | STABENOW | voting |
| Theodore | TED | STEVENS | voting |
| Wilbert | WILLIAM | TAUZIN | voting |
| Gary | GENE | TAYLOR | voting |
| Michael | MIKE | THOMPSON | voting |
| William | MAC | THORNBERRY | voting |
| James | J. | THURMOND | voting |
| Alice | DINA | TITUS | voting |
| Nicola | NIKI | TSONGAS | voting |
| Jim | JAMES | TURNER | voting |
| Thomas | TOM | UDALL | voting |
| NA | ROBERT | UNDERWOOD | nonvoting |
| Frederick | FRED | UPTON | voting |
| Christopher | CHRIS | VAN HOLLEN | voting |
| Tim | TIMOTHY | WALZ | voting |
| Zachary | ZACH | WAMP | voting |
| Wesley | WES | WATKINS | voting |
| Julius | J.C. | WATTS | voting |
| James | JIM | WEBB | voting |
| Wayne | W. | WELDON | voting |
| Wayne | ED | WHITFIELD | voting |
| Addison | JOE | WILSON | voting |
| Charlie | CHARLES | WILSON | voting |
| Ronald | RON | WYDEN | voting |
| Donald | DON | YOUNG | voting |
| Charles | C. | YOUNG | voting |
(Herseth Sandlin, Cubin, Rehberg, Young)
# mismatches
# speakers rows with no counterpart in members once first_name is left out of the join
# (no `by` is given, so the anti_join matches on every remaining shared column name)
speakers %>%
select(-first_name) %>%
anti_join(members) %>%
kablebox()| speakerid | lastname | firstname | chamber | state_abbrev | gender | party | district | nonvoting | congress | last_name | district_code |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 107115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 107 | CASTLE | NA |
| 107120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 107 | NORTON | NA |
| 107117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 107 | REHBERG | NA |
| 107113400 | UNDERWOOD | ROBERT | House | GU | M | D | 0 | nonvoting | 107 | UNDERWOOD | NA |
| 107117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 107 | CHRISTENSEN | NA |
| 107115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 107 | POMEROY | NA |
| 107118220 | SANDERS | BERNARD | House | VT | M | I | 0 | voting | 107 | SANDERS | NA |
| 107119090 | CRENSHAW | ANDER | House | FL | M | I | 4 | voting | 107 | CRENSHAW | 4 |
| 107121540 | THUNE | JOHN | House | SD | M | R | 0 | voting | 107 | THUNE | NA |
| 107113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | A | 0 | nonvoting | 107 | ACEVEDO-VILA | NA |
| 107117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 107 | FALEOMAVAEGA | NA |
| 107114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 107 | CUBIN | NA |
| 107121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 107 | YOUNG | NA |
| 108120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 108 | NORTON | NA |
| 108115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 108 | CASTLE | NA |
| 108118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 108 | BORDALLO | NA |
| 108117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 108 | FALEOMAVAEGA | NA |
| 108117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 108 | CHRISTENSEN | NA |
| 108117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 108 | REHBERG | NA |
| 108118220 | SANDERS | BERNARD | House | VT | M | I | 0 | voting | 108 | SANDERS | NA |
| 108113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | P | 0 | nonvoting | 108 | ACEVEDO-VILA | NA |
| 108115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 108 | POMEROY | NA |
| 108114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 108 | CUBIN | NA |
| 108113710 | JANKLOW | WILLIAM | House | SD | M | R | 0 | voting | 108 | JANKLOW | NA |
| 108117300 | ALEXANDER | RODNEY | House | LA | M | R | 5 | voting | 108 | ALEXANDER | 5 |
| 108121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 108 | YOUNG | NA |
| 108116660 | BONO MACK | MARY | House | CA | F | R | 45 | voting | 108 | BONO MACK | 45 |
| 109114511 | SPECTER | ARLEN | Senate | PA | M | D | NA | voting | 109 | SPECTER | 0 |
| 109117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 109 | FALEOMAVAEGA | NA |
| 109120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 109 | NORTON | NA |
| 109118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 109 | BORDALLO | NA |
| 109116660 | BONO MACK | MARY | House | CA | F | R | 45 | voting | 109 | BONO MACK | 45 |
| 109118220 | SANDERS | BERNARD | House | VT | M | I | 0 | voting | 109 | SANDERS | NA |
| 109115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 109 | POMEROY | NA |
| 109115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 109 | CASTLE | NA |
| 109117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 109 | REHBERG | NA |
| 109114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 109 | FORTUNO | NA |
| 109117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 109 | CHRISTENSEN | NA |
| 109115640 | HERSETH SANDLIN | STEPHANIE | House | SD | F | D | 0 | voting | 109 | HERSETH SANDLIN | NA |
| 109114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 109 | CUBIN | NA |
| 109121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 109 | YOUNG | NA |
| 110114511 | SPECTER | ARLEN | Senate | PA | M | D | NA | voting | 110 | SPECTER | 0 |
| 110121720 | WELCH | PETER | House | VT | M | D | 0 | voting | 110 | WELCH | NA |
| 110116471 | LIEBERMAN | JOSEPH | Senate | CT | M | I | NA | voting | 110 | LIEBERMAN | 0 |
| 110120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 110 | NORTON | NA |
| 110117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 110 | CHRISTENSEN | NA |
| 110115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 110 | CASTLE | NA |
| 110115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 110 | POMEROY | NA |
| 110116660 | BONO MACK | MARY | House | CA | F | R | 45 | voting | 110 | BONO MACK | 45 |
| 110114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 110 | FORTUNO | NA |
| 110118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 110 | BORDALLO | NA |
| 110117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 110 | FALEOMAVAEGA | NA |
| 110115640 | HERSETH SANDLIN | STEPHANIE | House | SD | F | D | 0 | voting | 110 | HERSETH SANDLIN | NA |
| 110114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 110 | CUBIN | NA |
| 110117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 110 | REHBERG | NA |
| 110121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 110 | YOUNG | NA |
| 111118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 111 | BORDALLO | NA |
| 111116471 | LIEBERMAN | JOSEPH | Senate | CT | M | I | NA | voting | 111 | LIEBERMAN | 0 |
| 111120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 111 | NORTON | NA |
| 111115640 | HERSETH SANDLIN | STEPHANIE | House | SD | F | D | 0 | voting | 111 | HERSETH SANDLIN | NA |
| 111115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 111 | CASTLE | NA |
| 111117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 111 | REHBERG | NA |
| 111117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 111 | CHRISTENSEN | NA |
| 111115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 111 | POMEROY | NA |
| 111121180 | SABLAN | GREGORIO | House | MP | M | D | 0 | nonvoting | 111 | SABLAN | NA |
| 111120320 | LUMMIS | CYNTHIA | House | WY | F | R | 0 | voting | 111 | LUMMIS | NA |
| 111121720 | WELCH | PETER | House | VT | M | D | 0 | voting | 111 | WELCH | NA |
| 111120820 | PIERLUISI | PEDRO | House | PR | M | D | 0 | nonvoting | 111 | PIERLUISI | NA |
| 111117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 111 | FALEOMAVAEGA | NA |
| 111121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 111 | YOUNG | NA |
If we drop first names and districts, we match almost all voting members, but we also over-match.
Without matching on first names, we get duplicate matches, e.g., André CARSON was elected to fill the seat of his grandmother, Julia May CARSON, so both appear in the same congress:
# overmatches in speakers data
# count speakers rows per congress/chamber/last name/state/party/district and keep
# the combinations that occur more than once, i.e. keys that cannot identify one speaker
speakers %>%
count(congress, chamber, last_name, state_abbrev, party, district) %>%
filter(n>1) %>%
kablebox()| congress | chamber | last_name | state_abbrev | party | district | n |
|---|---|---|---|---|---|---|
| 110 | House | CARSON | IN | D | 7 | 2 |
# potential overmatches
# same duplicate-key check on the members side, using district_code as the district field
members %>%
count(congress, chamber, last_name, state_abbrev, party, district_code) %>%
filter(n>1) %>%
kablebox()| congress | chamber | last_name | state_abbrev | party | district_code | n |
|---|---|---|---|---|---|---|
| 105 | House | BONO | CA | R | 44 | 2 |
| 105 | House | CAPPS | CA | D | 22 | 2 |
| 106 | Senate | CHAFEE | RI | R | 0 | 2 |
| 107 | House | SHUSTER | PA | R | 9 | 2 |
| 110 | House | CARSON | IN | D | 7 | 2 |
| 112 | House | PAYNE | NJ | D | 10 | 2 |
Without matching on first names OR districts (because of missing district data), we get even more duplicate/potential duplicate matches.
# overmatches in speakers data
# duplicate-key check without the district field: combinations of
# congress/chamber/last name/state/party that occur more than once in speakers
speakers %>%
count(congress, chamber, last_name, state_abbrev, party) %>%
filter(n>1) %>%
kablebox()| congress | chamber | last_name | state_abbrev | party | n |
|---|---|---|---|---|---|
| 107 | House | DAVIS | VA | R | 2 |
| 107 | House | MILLER | FL | R | 2 |
| 108 | House | DAVIS | VA | R | 2 |
| 108 | House | DIAZ-BALART | FL | R | 2 |
| 108 | House | SANCHEZ | CA | D | 2 |
| 109 | House | DAVIS | VA | R | 2 |
| 109 | House | DIAZ-BALART | FL | R | 2 |
| 109 | House | GREEN | TX | D | 2 |
| 109 | House | SANCHEZ | CA | D | 2 |
| 110 | House | CARSON | IN | D | 2 |
| 110 | House | DAVIS | VA | R | 2 |
| 110 | House | DIAZ-BALART | FL | R | 2 |
| 110 | House | GREEN | TX | D | 2 |
| 110 | House | SANCHEZ | CA | D | 2 |
| 111 | House | DIAZ-BALART | FL | R | 2 |
| 111 | House | GREEN | TX | D | 2 |
| 111 | House | SANCHEZ | CA | D | 2 |
# potential overmatches
# duplicate-key check without the district field on the members side
members %>%
count(congress, chamber, last_name, state_abbrev, party) %>%
filter(n>1) %>%
kablebox()| congress | chamber | last_name | state_abbrev | party | n |
|---|---|---|---|---|---|
| 105 | House | BONO | CA | R | 2 |
| 105 | House | CAPPS | CA | D | 2 |
| 106 | Senate | CHAFEE | RI | R | 2 |
| 107 | House | DAVIS | VA | R | 2 |
| 107 | House | MILLER | FL | R | 2 |
| 107 | House | SHUSTER | PA | R | 2 |
| 108 | House | DAVIS | VA | R | 2 |
| 108 | House | DIAZ-BALART | FL | R | 2 |
| 108 | House | SANCHEZ | CA | D | 2 |
| 109 | House | DAVIS | VA | R | 2 |
| 109 | House | DIAZ-BALART | FL | R | 2 |
| 109 | House | GREEN | TX | D | 2 |
| 109 | House | SANCHEZ | CA | D | 2 |
| 110 | House | CARSON | IN | D | 2 |
| 110 | House | DAVIS | VA | R | 2 |
| 110 | House | DIAZ-BALART | FL | R | 2 |
| 110 | House | GREEN | TX | D | 2 |
| 110 | House | SANCHEZ | CA | D | 2 |
| 111 | House | DIAZ-BALART | FL | R | 2 |
| 111 | House | GREEN | TX | D | 2 |
| 111 | House | SANCHEZ | CA | D | 2 |
| 112 | House | GREEN | TX | D | 2 |
| 112 | House | PAYNE | NJ | D | 2 |
| 112 | House | SANCHEZ | CA | D | 2 |
| 113 | House | GREEN | TX | D | 2 |
| 113 | House | MALONEY | NY | D | 2 |
| 113 | House | SANCHEZ | CA | D | 2 |
| 114 | House | GREEN | TX | D | 2 |
| 114 | House | MALONEY | NY | D | 2 |
| 114 | House | SANCHEZ | CA | D | 2 |
| 115 | House | GREEN | TX | D | 2 |
| 115 | House | MALONEY | NY | D | 2 |
| 115 | House | ROONEY | FL | R | 2 |
| 116 | House | MALONEY | NY | D | 2 |
Just for fun, we’ll give fastLink a try, even though we have a custom solution. fastLink returns a match for every voting member.
library(fastLink)
# Link speakers (first table) to members (second table) with fastLink, comparing
# the seven fields listed in varnames. return.df = TRUE is what makes the
# dfA.match / dfB.match data frames used below available.
fl.out <- fastLink(
  speakers,
  members,
  varnames = c(
    "chamber", "state_abbrev", "party", "congress",
    "last_name", "first_name", "district_code"
  ),
  # spell the logicals out: T and F are ordinary variables that can be reassigned
  return.df = TRUE,
  return.all = FALSE
)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
# number of speakers rows whose nonvoting field is "voting"
speakers %>% filter(nonvoting == "voting") %>% nrow()## [1] 2715
# number of rows in the speakers-side match table returned by fastLink
fl.out$dfA.match %>% nrow()## [1] 2715
# preview the first rows of the speakers-side matches
fl.out$dfA.match %>% head() %>% kablebox()| speakerid | lastname | firstname | chamber | state_abbrev | gender | party | district | nonvoting | congress | last_name | first_name | district_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 982 | 108115500 | DEAL | NATHAN | House | GA | M | R | 10 | voting | 108 | DEAL | Nathan | 10 |
| 617 | 108112321 | CAMPBELL | BEN | Senate | CO | M | R | NA | voting | 108 | CAMPBELL | Ben | 0 |
| 787 | 108117630 | HALL | RALPH | House | TX | M | R | 4 | voting | 108 | HALL | Ralph | 4 |
| 848 | 108113900 | TAUZIN | WILLIAM | House | LA | M | R | 3 | voting | 108 | TAUZIN | William | 3 |
| 719 | 108121351 | SHELBY | RICHARD | Senate | AL | M | R | NA | voting | 108 | SHELBY | Richard | 0 |
| 726 | 108113101 | JEFFORDS | JAMES | Senate | VT | M | I | NA | voting | 108 | JEFFORDS | James | 0 |
# preview the first rows of the members-side matches
fl.out$dfB.match %>% head() %>% kablebox()| chamber | congress | bioname | pattern | first_name | first_initial | common_name | middle_name | middle_initial | maiden_name | last_name | add_last_name | id | icpsr | party_code | cqlabel | state | state_abbrev | bioImgURL | seo_name | district_code | party_name | nominate.dim2 | nominate.dim1 | nominate.geo_mean_probability | party_size | first_last | first_maiden | common_last | first_middle_last | first_initial_last | common_middle_last | last | last_comma_first | last_first | first_maiden_last | common_middle_initial_last | common_maiden | commoninitial_last | first_middle_initial_last | firstinitial_middleinitial_last | last_comma_initial | last_comma_commoninitial | last_comma_common | maiden_comma_first | maiden_comma_firstinitial | chamber_last | party | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | House | 108 | DEAL, John Nathan | john deal|john nathan deal|deal|john n deal|nathan deal|nathan nathan deal|nathan n deal|deal|(^|senator |representative )dealdeal, nathan|deal, john|deal john|deal, jdeal, nrepresentative dealj n deal | John | J | Nathan | Nathan | N | NA | DEAL | NA | MH10899342 | 99342 | 200 | (GA-10) | georgia | GA | 099342.jpg | john-nathan-deal | 10 | Republican Party | 0.139 | 0.581 | 0.8970000000000000 | 231 | John DEAL | 404error | Nathan DEAL | John Nathan DEAL | DEAL | Nathan Nathan DEAL | (^|senator |representative )DEAL/td> | DEAL, John | DEAL John | 404error | Nathan N DEAL | 404error | DEAL | John N DEAL | J N DEAL | DEAL, J/td> | DEAL, N/td> | DEAL, Nathan | 404error | 404error | Representative DEAL/td> | R |
| 3 | Senate | 108 | CAMPBELL, Ben Nighthorse | ben campbell|ben nighthorse campbell|campbell|ben n campbell|benjamin campbell|benjamin nighthorse campbell|benjamin n campbell|(^|senator |representative )campbellcampbell, benjamin|campbell, ben|campbell ben|campbell, bsenator campbellb n campbell | Ben | B | Benjamin | Nighthorse | N | NA | CAMPBELL | NA | MS10895407 | 95407 | 200 | (CO) | colorado | CO | 095407.jpg | ben-nighthorse-campbell | 0 | Republican Party | 0.132 | 0.230 | 0.8220000000000000 | 51 | Ben CAMPBELL | 404error | Benjamin CAMPBELL | Ben Nighthorse CAMPBELL | CAMPBELL | Benjamin Nighthorse CAMPBELL | (^|senator |representative )CAMPBELL/td> | CAMPBELL, Ben | CAMPBELL Ben | 404error | Benjamin N CAMPBELL | 404error | CAMPBELL | Ben N CAMPBELL | B N CAMPBELL | CAMPBELL, B/td> | CAMPBELL, B/td> | CAMPBELL, Benjamin | 404error | 404error | Senator CAMPBELL/td> | R |
| 4 | House | 108 | HALL, Ralph Moody | ralph hall|ralph moody hall|hall|ralph m hall|hall|(^|senator |representative )hallhall, ralph|hall ralph|hall, rrepresentative hallr m hall | Ralph | R | NA | Moody | M | NA | HALL | NA | MH10894828 | 94828 | 200 | (TX-04) | texas | TX | 094828.jpg | ralph-moody-hall | 4 | Republican Party | 0.308 | 0.424 | 0.8149999999999999 | 231 | Ralph HALL | 404error | 404error | Ralph Moody HALL | HALL | 404error | (^|senator |representative )HALL/td> | HALL, Ralph | HALL Ralph | 404error | 404error | 404error | HALL | Ralph M HALL | R M HALL | HALL, R/td> | 404error | 404error | 404error | 404error | Representative HALL/td> | R |
| 5 | House | 108 | TAUZIN, Wilbert Joseph (Billy) | wilbert tauzin|wilbert joseph tauzin|tauzin|wilbert j tauzin|billy tauzin|billy joseph tauzin|billy j tauzin|tauzin|(^|senator |representative )tauzintauzin, billy|tauzin, wilbert|tauzin wilbert|tauzin, wtauzin, brepresentative tauzinw j tauzin | Wilbert | W | Billy | Joseph | J | NA | TAUZIN | NA | MH10894679 | 94679 | 200 | (LA-03) | louisiana | LA | 094679.jpg | wilbert-joseph-billy-tauzin | 3 | Republican Party | 0.160 | 0.341 | 0.9100000000000000 | 231 | Wilbert TAUZIN | 404error | Billy TAUZIN | Wilbert Joseph TAUZIN | TAUZIN | Billy Joseph TAUZIN | (^|senator |representative )TAUZIN/td> | TAUZIN, Wilbert | TAUZIN Wilbert | 404error | Billy J TAUZIN | 404error | TAUZIN | Wilbert J TAUZIN | W J TAUZIN | TAUZIN, W/td> | TAUZIN, B/td> | TAUZIN, Billy | 404error | 404error | Representative TAUZIN/td> | R |
| 6 | Senate | 108 | SHELBY, Richard C. | richard shelby|richard c shelby|shelby|rich shelby|rich c shelby|(^|senator |representative )shelbyshelby, rich|shelby, richard|shelby richard|shelby, rsenator shelbyr c shelby | Richard | R | Rich | C | C | NA | SHELBY | NA | MS10894659 | 94659 | 200 | (AL) | alabama | AL | 094659.jpg | richard-c-shelby | 0 | Republican Party | 0.520 | 0.429 | 0.8766256675360421 | 51 | Richard SHELBY | 404error | Rich SHELBY | Richard C SHELBY | SHELBY | Rich C SHELBY | (^|senator |representative )SHELBY/td> | SHELBY, Richard | SHELBY Richard | 404error | Rich C SHELBY | 404error | SHELBY | Richard C SHELBY | R C SHELBY | SHELBY, R/td> | SHELBY, R/td> | SHELBY, Rich | 404error | 404error | Senator SHELBY/td> | R |
| 7 | Senate | 108 | JEFFORDS, James Merrill | james jeffords|james merrill jeffords|jeffords|james m jeffords|jim jeffords|jim merrill jeffords|jim m jeffords|(^|senator |representative )jeffordsjeffords, jim|jeffords, james|jeffords james|jeffords, jsenator jeffordsj m jeffords | James | J | Jim | Merrill | M | NA | JEFFORDS | NA | MS10894240 | 94240 | 328 | (VT) | vermont | VT | 094240.jpg | james-merrill-jeffords | 0 | Independent | -0.603 | -0.277 | 0.7340000000000000 | 1 | James JEFFORDS | 404error | Jim JEFFORDS | James Merrill JEFFORDS | JEFFORDS | Jim Merrill JEFFORDS | (^|senator |representative )JEFFORDS/td> | JEFFORDS, James | JEFFORDS James | 404error | Jim M JEFFORDS | 404error | JEFFORDS | James M JEFFORDS | J M JEFFORDS | JEFFORDS, J/td> | JEFFORDS, J/td> | JEFFORDS, Jim | 404error | 404error | Senator JEFFORDS/td> | I |
# Attach selected voteview fields from the members-side matches to the
# speakers-side matches (columns are copied by row position), then full-join
# back to speakers so speakers without a match are kept as well.
fl.speakers <- full_join(
  mutate(
    fl.out$dfA.match,
    icpsr = fl.out$dfB.match$icpsr,
    bioname = fl.out$dfB.match$bioname,
    party_name = fl.out$dfB.match$party_name,
    district_code_voteview = fl.out$dfB.match$district_code
  ),
  speakers
)
# failed matches
# rows of fl.speakers that have no icpsr, i.e. that received no voteview fields
fl.speakers %>% filter(is.na(icpsr)) %>% kablebox()| speakerid | lastname | firstname | chamber | state_abbrev | gender | party | district | nonvoting | congress | last_name | first_name | district_code | icpsr | bioname | party_name | district_code_voteview |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 107120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 107 | NORTON | Eleanor | NA | NA | NA | NA | NA |
| 107113400 | UNDERWOOD | ROBERT | House | GU | M | D | 0 | nonvoting | 107 | UNDERWOOD | Robert | NA | NA | NA | NA | NA |
| 107117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 107 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
| 107113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | A | 0 | nonvoting | 107 | ACEVEDO-VILA | Anibal | NA | NA | NA | NA | NA |
| 107117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 107 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
| 108120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 108 | NORTON | Eleanor | NA | NA | NA | NA | NA |
| 108118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 108 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
| 108117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 108 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
| 108117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 108 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
| 108113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | P | 0 | nonvoting | 108 | ACEVEDO-VILA | Anibal | NA | NA | NA | NA | NA |
| 109117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 109 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
| 109120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 109 | NORTON | Eleanor | NA | NA | NA | NA | NA |
| 109118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 109 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
| 109114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 109 | FORTUNO | Luis | NA | NA | NA | NA | NA |
| 109117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 109 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
| 110120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 110 | NORTON | Eleanor | NA | NA | NA | NA | NA |
| 110117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 110 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
| 110114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 110 | FORTUNO | Luis | NA | NA | NA | NA | NA |
| 110118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 110 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
| 110117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 110 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
| 111118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 111 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
| 111120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 111 | NORTON | Eleanor | NA | NA | NA | NA | NA |
| 111117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 111 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
| 111121180 | SABLAN | GREGORIO | House | MP | M | D | 0 | nonvoting | 111 | SABLAN | Gregorio | NA | NA | NA | NA | NA |
| 111120820 | PIERLUISI | PEDRO | House | PR | M | D | 0 | nonvoting | 111 | PIERLUISI | Pedro | NA | NA | NA | NA | NA |
| 111117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 111 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
# party mismatches
# rows where the speakers party letter differs from the first letter of the
# voteview party_name; party and party_name are moved to the front for display
fl.speakers %>% filter(party != party_name %>% str_sub(1, 1)) %>%
select(party, party_name, everything()) %>%
kablebox()| party | party_name | speakerid | lastname | firstname | chamber | state_abbrev | gender | district | nonvoting | congress | last_name | first_name | district_code | icpsr | bioname | district_code_voteview |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| R | Democratic Party | 108117300 | ALEXANDER | RODNEY | House | LA | M | 5 | voting | 108 | ALEXANDER | Rodney | 5 | 20327 | ALEXANDER, Rodney | 5 |
| I | Republican Party | 107119090 | CRENSHAW | ANDER | House | FL | M | 4 | voting | 107 | CRENSHAW | Ander | 4 | 20111 | CRENSHAW, Ander | 4 |
| I | Democratic Party | 111116471 | LIEBERMAN | JOSEPH | Senate | CT | M | NA | voting | 111 | LIEBERMAN | Joseph | 0 | 15704 | LIEBERMAN, Joseph I. | 0 |
| I | Democratic Party | 110116471 | LIEBERMAN | JOSEPH | Senate | CT | M | NA | voting | 110 | LIEBERMAN | Joseph | 0 | 15704 | LIEBERMAN, Joseph I. | 0 |
| D | Republican Party | 110114511 | SPECTER | ARLEN | Senate | PA | M | NA | voting | 110 | SPECTER | Arlen | 0 | 14910 | SPECTER, Arlen | 0 |
| D | Republican Party | 109114511 | SPECTER | ARLEN | Senate | PA | M | NA | voting | 109 | SPECTER | Arlen | 0 | 14910 | SPECTER, Arlen | 0 |
# district mismatches
# fl.speakers %>% filter(district_code != district_code_voteview) %>% kablebox()
# start with fresh data (can eventually delete everything above up to map function if we use this method)
# speakers dataframe: re-read every speakermap file and stack the rows
speakers <- map_dfr(speakermaps$file, get_speakers)
# match to voteview format
speakers <- speakers %>%
  mutate(
    # strip one leading zero, then convert to a number
    congress = as.numeric(str_remove(congress, "^0")),
    # expand a trailing "S" / "H" to the full chamber name
    chamber = str_replace(str_replace(chamber, "S$", "Senate"), "H$", "House"),
    district = ifelse(chamber == "Senate", 0, district),
    # clean up some 0s that are actually missing data
    district = as.numeric(
      ifelse(chamber == "House" & district == "0", NA, district)
    )
  ) %>%
  rename(state_abbrev = state) %>%
  # FIXME focusing on the 107-111th for now
  filter(congress > 106)
# combine information into one field: "<chamber> <firstname> <lastname> <state>",
# with the chamber rewritten as a title and a leading "R." after
# "Representative" removed
speakers <- speakers %>%
  mutate(
    speaker = paste(chamber, firstname, lastname, state_abbrev),
    speaker = str_replace(speaker, "Senate", "Senator"),
    speaker = str_replace(speaker, "House", "Representative"),
    speaker = str_replace(speaker, "Representative R\\.", "Representative")
  )
# name matching function
source(here("code", "nameMethods.R"))
# typos in these data include "WM" instead of "William" and "R. Miller" for
# "Representative Brad Miller"; the rest are known permutations and nicknames
source(here("code", "MemberNameTypos.R"))
# match each combined speaker string against the members data
speakers1 <- extractMemberName(speakers, col_name = "speaker", members = members)
# FIXME, overmatches party switchers (and maybe others)
# Build a speakerid-to-icpsr crosswalk from the name-matching result, then attach
# state, party name and district from members. The join passes no `by`, so it
# matches on the columns the two selections share (icpsr, congress, chamber).
crosswalk <- speakers1 %>%
select(speaker, speakerid, icpsr, bioname, congress, chamber, nonvoting) %>%
distinct() %>%
left_join(members %>%
select(icpsr, congress, chamber, state_abbrev, party_name, district_code)) %>%
distinct()My congressional name search function gets us 100% matches when we don’t use party information, but party-switchers get an ICPSR for both parties. When we match on party, we find that some party info in the speakers data are incorrect.
# failed to match in crosswalk
# voting members in the crosswalk that still have no icpsr
crosswalk %>% filter(is.na(icpsr) & nonvoting == "voting") %>%
kablebox()| speaker | speakerid | icpsr | bioname | congress | chamber | nonvoting | state_abbrev | party_name | district_code |
|---|---|---|---|---|---|---|---|---|---|
Missing district data in the speakers data is fine; we can fill it in from voteview.
I have not yet found incorrect districts, but these would cause this to fail (and rightly so, getting the district right is important!).
However, the speakers data has a few incorrect parties:
- Crenshaw was never an independent.
- Alexander was a D in the 108th.
- Specter switched in 2009 (111th); he was not a D in the 109th or 110th.
- Lieberman called himself an “independent Democrat” and caucused with the Ds, so he is only a D in voteview.
These can be corrected, or we can use the parties from voteview if we decide to go with each member's first party or modal party. There are likely more instances in the speakers data where a member's party is coded inconsistently with either principle.
This result is identical to fastLink.
# failed to match in data
# Join speakers to the crosswalk and to the de-duplicated voteview fields, then
# keep only rows that are consistent on district and party.
test <- speakers %>%
  left_join(crosswalk) %>%
  left_join(
    distinct(
      select(members, icpsr, chamber, congress, state_abbrev, party_name, district_code)
    )
  ) %>%
  filter(
    # require district match where there is district data in speaker data
    district == district_code | is.na(district),
    # drop party switchers
    party == str_sub(party_name, 1, 1) | is.na(party_name)
  ) %>%
  distinct()
# speakers rows that have no counterpart in `test`, i.e. that were removed by the
# district/party filters above; party is moved to the front for display
speakers %>%
anti_join(test) %>%
select(party, everything()) %>%
kablebox()| party | speakerid | lastname | firstname | chamber | state_abbrev | gender | district | nonvoting | congress | speaker |
|---|---|---|---|---|---|---|---|---|---|---|
| I | 107119090 | CRENSHAW | ANDER | House | FL | M | 4 | voting | 107 | Representative ANDER CRENSHAW FL |
| R | 108117300 | ALEXANDER | RODNEY | House | LA | M | 5 | voting | 108 | Representative RODNEY ALEXANDER LA |
| D | 109114511 | SPECTER | ARLEN | Senate | PA | M | 0 | voting | 109 | Senator ARLEN SPECTER PA |
| D | 110114511 | SPECTER | ARLEN | Senate | PA | M | 0 | voting | 110 | Senator ARLEN SPECTER PA |
| I | 110116471 | LIEBERMAN | JOSEPH | Senate | CT | M | 0 | voting | 110 | Senator JOSEPH LIEBERMAN CT |
| I | 111116471 | LIEBERMAN | JOSEPH | Senate | CT | M | 0 | voting | 111 | Senator JOSEPH LIEBERMAN CT |