# Read one speaker-map file and return its distinct speaker rows.
#
# `file` is a file name inside the project's "hein-bound" directory.
# The file is parsed as pipe-delimited text, the `speech_id` column is
# dropped, and a `congress` column is added holding the part of the
# file name before its first underscore.
get_speakers <- function(file) {
  # log which file is being read
  message(file)

  # NOTE(review): the first "/cr" is stripped from the path built by here();
  # presumably the project root sits in a "cr" subdirectory while the data
  # live one level up -- confirm.
  read_delim(str_c(here("hein-bound/", file) %>% str_remove("/cr")),
             delim = "|") %>%
    # drop speech data for now
    select(-speech_id) %>%
    # extract congress from file name
    mutate(congress = str_remove(file, "_.*")) %>%
    distinct()
}
# files: one row per file found in the hein-bound directory
heinbound <- list.files(here("hein-bound") %>% str_remove("/cr")) %>%
  enframe(value = "file")

# subset to speakermap files from recent congresses
speakermaps <- heinbound %>%
  filter(str_detect(file, "SpeakerMap"),
         str_detect(file, "^1") # FIXME subsetting to >99th congress
  )
# speakers dataframe: row-bind the result of get_speakers() for every file
speakers <- map_dfr(speakermaps$file, get_speakers)

# match to voteview format
speakers %<>%
  mutate(last_name = lastname,
         first_name = str_to_sentence(firstname),
         # drop a leading zero from the congress string, then make it numeric
         congress = congress %>% str_remove("^0") %>% as.numeric(),
         # expand a trailing "S" / "H" to the full chamber name
         chamber = chamber %>%
           str_replace("S$", "Senate") %>%
           str_replace("H$", "House"),
         district_code = ifelse(chamber == "Senate", 0, district),
         # clean up some 0s that are actually missing data
         district_code = ifelse(chamber == "House" & district_code == "0", NA, district_code) %>%
           as.numeric()
  )

speakers %<>%
  rename(state_abbrev = state)

# FIXME focusing on the 107-111th for now
speakers %<>%
  filter(congress > 106)
# Devin's members data (expanded from voteview)
# NOTE(review): the code below assumes this .Rdata file defines `members` -- confirm.
load(here("data", "members.Rdata"))

# members %>% select(names(members)[names(members) %in% names(speakers)])

# Make mixed case names, like "McConnell" upper case (to go the other way, we would need to fix them one by one)
members %<>%
  mutate(last_name = str_to_upper(last_name),
         # one-letter party code: first character of party_name
         party = str_sub(party_name, 1, 1))
A few last names also differ between the two sources; e.g., Mary Bono Mack fails to match because she is listed as just Mary Bono in Voteview.
# Speakers that fail an exact join to members, re-joined without first_name
# so the first names from the two sources can be compared side by side.
speakers %>%
  select(-district_code) %>%
  # keep only speakers with no match on the shared columns
  anti_join(members) %>%
  select(-first_name) %>%
  left_join(members) %>%
  select(voteview_first_name = first_name,
         speakermap_firstname = firstname,
         lastname, nonvoting) %>%
  distinct() %>%
  arrange(lastname) %>%
  kablebox()
voteview_first_name | speakermap_firstname | lastname | nonvoting |
---|---|---|---|
NA | ANIBAL | ACEVEDO-VILA | nonvoting |
W | W. | AKIN | voting |
NA | RODNEY | ALEXANDER | voting |
A | WAYNE | ALLARD | voting |
Kenneth | KEN | BENTSEN | voting |
Jesse | JEFF | BINGAMAN | voting |
Robert | ROB | BISHOP | voting |
Josiah | JO | BONNER | voting |
NA | MARY | BONO MACK | voting |
NA | MADELEINE | BORDALLO | nonvoting |
Daniel | DAN | BOREN | voting |
F | ALLEN | BOYD | voting |
Sam | SAMUEL | BROWNBACK | voting |
Vernon | VERN | BUCHANAN | voting |
James | JIM | BUNNING | voting |
Danny | DAN | BURTON | voting |
Herbert | H. | CALLAHAN | voting |
David | DAVE | CAMP | voting |
Chris | CHRISTOPHER | CARNEY | voting |
A | BEN | CHANDLER | voting |
NA | DONNA | CHRISTENSEN | nonvoting |
William | WM. | CLAY | voting |
Joseph | J. | CLELAND | voting |
William | THAD | COCHRAN | voting |
Stephen | STEVE | COHEN | voting |
Kenneth | K. | CONAWAY | voting |
Christopher | CHRIS | COONS | voting |
James | JIM | COOPER | voting |
Robert | BOB | CORKER | voting |
Charles | C. | COX | voting |
NA | ANDER | CRENSHAW | voting |
Thomas | JO ANN | DAVIS | voting |
Jo | JO ANN | DAVIS | voting |
Jim | JAMES | DAVIS | voting |
Geoffrey | GEOFF | DAVIS | voting |
John | NATHAN | DEAL | voting |
Bill | WILLIAM | DELAHUNT | voting |
James | JIM | DEMINT | voting |
Jo | JO ANN | EMERSON | voting |
Bobby | BOB | ETHERIDGE | voting |
Robert | TERRY | EVERETT | voting |
NA | ENI | FALEOMAVAEGA | nonvoting |
James | J. | FORBES | voting |
NA | LUIS | FORTUNO | nonvoting |
Al | ALAN | FRANKEN | voting |
Robert | BOB | GOODLATTE | voting |
Daniel | BOB | GRAHAM | voting |
William | PHIL | GRAMM | voting |
Samuel | SAM | GRAVES | voting |
Raymond | GENE | GREEN | voting |
Al | GENE | GREEN | voting |
John | J. | HASTERT | voting |
Richard | DOC | HASTINGS | voting |
Robert | ROBIN | HAYES | voting |
Thomas | TIM | HOLDEN | voting |
Mike | MICHAEL | HONDA | voting |
John | STEPHEN | HORN | voting |
Timothy | TIM | HUTCHINSON | voting |
Kathryn | KAY | HUTCHISON | voting |
Robert | BOB | INGLIS | voting |
Johnny | JOHN | ISAKSON | voting |
Timothy | TIM | JOHNSON | voting |
Hank | HENRY | JOHNSON | voting |
Steven | STEVE | KAGEN | voting |
Marcia | MARCY | KAPTUR | voting |
Richard | RIC | KELLER | voting |
Mary | MARY JO | KILROY | voting |
Thomas | TOM | LANTOS | voting |
Richard | RICK | LARSEN | voting |
Thomas | TOM | LATHAM | voting |
Charles | JERRY | LEWIS | voting |
NA | JOSEPH | LIEBERMAN | voting |
Dave | DAVID | LOEBSACK | voting |
Chester | TRENT | LOTT | voting |
Ken | KENNETH | LUCAS | voting |
Jim | JAMES | MARSHALL | voting |
Melquiades | MEL | MARTINEZ | voting |
James | JIM | MATHESON | voting |
Addison | MITCH | MCCONNELL | voting |
James | JIM | MCDERMOTT | voting |
Daniel | JEFF | MILLER | voting |
Jefferson | JEFF | MILLER | voting |
Daniel | DAN | MILLER | voting |
Jefferson | DAN | MILLER | voting |
Brad | R. | MILLER | voting |
Walt | WALTER | MINNICK | voting |
Gwendolynne | GWEN | MOORE | voting |
Timothy | TIM | MURPHY | voting |
Clarence | BILL | NELSON | voting |
Earl | BEN | NELSON | voting |
Donald | DON | NICKLES | voting |
NA | ELEANOR | NORTON | nonvoting |
C | C.L. | OTTER | voting |
William | BILL | PASCRELL | voting |
Tom | THOMAS | PERRIELLO | voting |
NA | PEDRO | PIERLUISI | nonvoting |
John | JACK | QUINN | voting |
Thomas | TOM | REED | voting |
Denny | DENNIS | REHBERG | voting |
Robert | BOB | RILEY | voting |
Charles | PAT | ROBERTS | voting |
Michael | MIKE | ROSS | voting |
Margaret | MARGE | ROUKEMA | voting |
C | C. | RUPPERSBERGER | voting |
Timothy | TIM | RYAN | voting |
NA | GREGORIO | SABLAN | nonvoting |
Kenneth | KEN | SALAZAR | voting |
William | BILL | SALI | voting |
Hugh | H. | SAXTON | voting |
Charles | JOE | SCARBOROUGH | voting |
Robert | BOB | SCHAFFER | voting |
Frank | F. | SENSENBRENNER | voting |
Eugene | E. | SHAW | voting |
Don | DONALD | SHERWOOD | voting |
Clifford | RONNIE | SHOWS | voting |
William | BILL | SHUSTER | voting |
Isaac | IKE | SKELTON | voting |
Zack | ZACHARY | SPACE | voting |
NA | ARLEN | SPECTER | voting |
Karen | JACKIE | SPEIER | voting |
Deborah | DEBBIE | STABENOW | voting |
Theodore | TED | STEVENS | voting |
Wilbert | WILLIAM | TAUZIN | voting |
Gary | GENE | TAYLOR | voting |
Michael | MIKE | THOMPSON | voting |
William | MAC | THORNBERRY | voting |
James | J. | THURMOND | voting |
Alice | DINA | TITUS | voting |
Nicola | NIKI | TSONGAS | voting |
Jim | JAMES | TURNER | voting |
Thomas | TOM | UDALL | voting |
NA | ROBERT | UNDERWOOD | nonvoting |
Frederick | FRED | UPTON | voting |
Christopher | CHRIS | VAN HOLLEN | voting |
Tim | TIMOTHY | WALZ | voting |
Zachary | ZACH | WAMP | voting |
Wesley | WES | WATKINS | voting |
Julius | J.C. | WATTS | voting |
James | JIM | WEBB | voting |
Wayne | W. | WELDON | voting |
Wayne | ED | WHITFIELD | voting |
Addison | JOE | WILSON | voting |
Charlie | CHARLES | WILSON | voting |
Ronald | RON | WYDEN | voting |
Donald | DON | YOUNG | voting |
Charles | C. | YOUNG | voting |
(Herseth Sandlin, Cubin, Rehberg, Young)
# mismatches: speakers with no match in members once first_name is dropped
speakers %>%
  select(-first_name) %>%
  anti_join(members) %>%
  kablebox()
speakerid | lastname | firstname | chamber | state_abbrev | gender | party | district | nonvoting | congress | last_name | district_code |
---|---|---|---|---|---|---|---|---|---|---|---|
107115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 107 | CASTLE | NA |
107120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 107 | NORTON | NA |
107117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 107 | REHBERG | NA |
107113400 | UNDERWOOD | ROBERT | House | GU | M | D | 0 | nonvoting | 107 | UNDERWOOD | NA |
107117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 107 | CHRISTENSEN | NA |
107115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 107 | POMEROY | NA |
107118220 | SANDERS | BERNARD | House | VT | M | I | 0 | voting | 107 | SANDERS | NA |
107119090 | CRENSHAW | ANDER | House | FL | M | I | 4 | voting | 107 | CRENSHAW | 4 |
107121540 | THUNE | JOHN | House | SD | M | R | 0 | voting | 107 | THUNE | NA |
107113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | A | 0 | nonvoting | 107 | ACEVEDO-VILA | NA |
107117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 107 | FALEOMAVAEGA | NA |
107114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 107 | CUBIN | NA |
107121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 107 | YOUNG | NA |
108120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 108 | NORTON | NA |
108115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 108 | CASTLE | NA |
108118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 108 | BORDALLO | NA |
108117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 108 | FALEOMAVAEGA | NA |
108117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 108 | CHRISTENSEN | NA |
108117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 108 | REHBERG | NA |
108118220 | SANDERS | BERNARD | House | VT | M | I | 0 | voting | 108 | SANDERS | NA |
108113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | P | 0 | nonvoting | 108 | ACEVEDO-VILA | NA |
108115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 108 | POMEROY | NA |
108114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 108 | CUBIN | NA |
108113710 | JANKLOW | WILLIAM | House | SD | M | R | 0 | voting | 108 | JANKLOW | NA |
108117300 | ALEXANDER | RODNEY | House | LA | M | R | 5 | voting | 108 | ALEXANDER | 5 |
108121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 108 | YOUNG | NA |
108116660 | BONO MACK | MARY | House | CA | F | R | 45 | voting | 108 | BONO MACK | 45 |
109114511 | SPECTER | ARLEN | Senate | PA | M | D | NA | voting | 109 | SPECTER | 0 |
109117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 109 | FALEOMAVAEGA | NA |
109120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 109 | NORTON | NA |
109118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 109 | BORDALLO | NA |
109116660 | BONO MACK | MARY | House | CA | F | R | 45 | voting | 109 | BONO MACK | 45 |
109118220 | SANDERS | BERNARD | House | VT | M | I | 0 | voting | 109 | SANDERS | NA |
109115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 109 | POMEROY | NA |
109115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 109 | CASTLE | NA |
109117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 109 | REHBERG | NA |
109114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 109 | FORTUNO | NA |
109117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 109 | CHRISTENSEN | NA |
109115640 | HERSETH SANDLIN | STEPHANIE | House | SD | F | D | 0 | voting | 109 | HERSETH SANDLIN | NA |
109114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 109 | CUBIN | NA |
109121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 109 | YOUNG | NA |
110114511 | SPECTER | ARLEN | Senate | PA | M | D | NA | voting | 110 | SPECTER | 0 |
110121720 | WELCH | PETER | House | VT | M | D | 0 | voting | 110 | WELCH | NA |
110116471 | LIEBERMAN | JOSEPH | Senate | CT | M | I | NA | voting | 110 | LIEBERMAN | 0 |
110120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 110 | NORTON | NA |
110117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 110 | CHRISTENSEN | NA |
110115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 110 | CASTLE | NA |
110115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 110 | POMEROY | NA |
110116660 | BONO MACK | MARY | House | CA | F | R | 45 | voting | 110 | BONO MACK | 45 |
110114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 110 | FORTUNO | NA |
110118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 110 | BORDALLO | NA |
110117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 110 | FALEOMAVAEGA | NA |
110115640 | HERSETH SANDLIN | STEPHANIE | House | SD | F | D | 0 | voting | 110 | HERSETH SANDLIN | NA |
110114670 | CUBIN | BARBARA | House | WY | F | R | 0 | voting | 110 | CUBIN | NA |
110117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 110 | REHBERG | NA |
110121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 110 | YOUNG | NA |
111118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 111 | BORDALLO | NA |
111116471 | LIEBERMAN | JOSEPH | Senate | CT | M | I | NA | voting | 111 | LIEBERMAN | 0 |
111120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 111 | NORTON | NA |
111115640 | HERSETH SANDLIN | STEPHANIE | House | SD | F | D | 0 | voting | 111 | HERSETH SANDLIN | NA |
111115450 | CASTLE | MICHAEL | House | DE | M | R | 0 | voting | 111 | CASTLE | NA |
111117040 | REHBERG | DENNIS | House | MT | M | R | 0 | voting | 111 | REHBERG | NA |
111117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 111 | CHRISTENSEN | NA |
111115960 | POMEROY | EARL | House | ND | M | D | 0 | voting | 111 | POMEROY | NA |
111121180 | SABLAN | GREGORIO | House | MP | M | D | 0 | nonvoting | 111 | SABLAN | NA |
111120320 | LUMMIS | CYNTHIA | House | WY | F | R | 0 | voting | 111 | LUMMIS | NA |
111121720 | WELCH | PETER | House | VT | M | D | 0 | voting | 111 | WELCH | NA |
111120820 | PIERLUISI | PEDRO | House | PR | M | D | 0 | nonvoting | 111 | PIERLUISI | NA |
111117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 111 | FALEOMAVAEGA | NA |
111121820 | YOUNG | DON | House | AK | M | R | 0 | voting | 111 | YOUNG | NA |
If we drop first names and districts, we match almost all voting members, but we also over-match.
Without matching on first names, we get duplicate matches, e.g., André CARSON was appointed to fill his grandmother’s seat, Julia May CARSON:
# overmatches in speakers data: key combinations (including district)
# that occur more than once
speakers %>%
  count(congress, chamber, last_name, state_abbrev, party, district) %>%
  filter(n > 1) %>%
  kablebox()
congress | chamber | last_name | state_abbrev | party | district | n |
---|---|---|---|---|---|---|
110 | House | CARSON | IN | D | 7 | 2 |
# potential overmatches: key combinations (including district_code)
# that occur more than once in members
members %>%
  count(congress, chamber, last_name, state_abbrev, party, district_code) %>%
  filter(n > 1) %>%
  kablebox()
congress | chamber | last_name | state_abbrev | party | district_code | n |
---|---|---|---|---|---|---|
105 | House | BONO | CA | R | 44 | 2 |
105 | House | CAPPS | CA | D | 22 | 2 |
106 | Senate | CHAFEE | RI | R | 0 | 2 |
107 | House | SHUSTER | PA | R | 9 | 2 |
110 | House | CARSON | IN | D | 7 | 2 |
112 | House | PAYNE | NJ | D | 10 | 2 |
Without matching on first names OR districts (because of missing district data), we get even more duplicate/potential duplicate matches.
# overmatches in speakers data: key combinations (ignoring district)
# that occur more than once
speakers %>%
  count(congress, chamber, last_name, state_abbrev, party) %>%
  filter(n > 1) %>%
  kablebox()
congress | chamber | last_name | state_abbrev | party | n |
---|---|---|---|---|---|
107 | House | DAVIS | VA | R | 2 |
107 | House | MILLER | FL | R | 2 |
108 | House | DAVIS | VA | R | 2 |
108 | House | DIAZ-BALART | FL | R | 2 |
108 | House | SANCHEZ | CA | D | 2 |
109 | House | DAVIS | VA | R | 2 |
109 | House | DIAZ-BALART | FL | R | 2 |
109 | House | GREEN | TX | D | 2 |
109 | House | SANCHEZ | CA | D | 2 |
110 | House | CARSON | IN | D | 2 |
110 | House | DAVIS | VA | R | 2 |
110 | House | DIAZ-BALART | FL | R | 2 |
110 | House | GREEN | TX | D | 2 |
110 | House | SANCHEZ | CA | D | 2 |
111 | House | DIAZ-BALART | FL | R | 2 |
111 | House | GREEN | TX | D | 2 |
111 | House | SANCHEZ | CA | D | 2 |
# potential overmatches: key combinations (ignoring district)
# that occur more than once in members
members %>%
  count(congress, chamber, last_name, state_abbrev, party) %>%
  filter(n > 1) %>%
  kablebox()
congress | chamber | last_name | state_abbrev | party | n |
---|---|---|---|---|---|
105 | House | BONO | CA | R | 2 |
105 | House | CAPPS | CA | D | 2 |
106 | Senate | CHAFEE | RI | R | 2 |
107 | House | DAVIS | VA | R | 2 |
107 | House | MILLER | FL | R | 2 |
107 | House | SHUSTER | PA | R | 2 |
108 | House | DAVIS | VA | R | 2 |
108 | House | DIAZ-BALART | FL | R | 2 |
108 | House | SANCHEZ | CA | D | 2 |
109 | House | DAVIS | VA | R | 2 |
109 | House | DIAZ-BALART | FL | R | 2 |
109 | House | GREEN | TX | D | 2 |
109 | House | SANCHEZ | CA | D | 2 |
110 | House | CARSON | IN | D | 2 |
110 | House | DAVIS | VA | R | 2 |
110 | House | DIAZ-BALART | FL | R | 2 |
110 | House | GREEN | TX | D | 2 |
110 | House | SANCHEZ | CA | D | 2 |
111 | House | DIAZ-BALART | FL | R | 2 |
111 | House | GREEN | TX | D | 2 |
111 | House | SANCHEZ | CA | D | 2 |
112 | House | GREEN | TX | D | 2 |
112 | House | PAYNE | NJ | D | 2 |
112 | House | SANCHEZ | CA | D | 2 |
113 | House | GREEN | TX | D | 2 |
113 | House | MALONEY | NY | D | 2 |
113 | House | SANCHEZ | CA | D | 2 |
114 | House | GREEN | TX | D | 2 |
114 | House | MALONEY | NY | D | 2 |
114 | House | SANCHEZ | CA | D | 2 |
115 | House | GREEN | TX | D | 2 |
115 | House | MALONEY | NY | D | 2 |
115 | House | ROONEY | FL | R | 2 |
116 | House | MALONEY | NY | D | 2 |
Just for fun, we’ll give fastLink a try, even though we have a custom solution. fastLink returns a match for every voting member.
library(fastLink)

# Probabilistic record linkage of speakers (dfA) to members (dfB) on the
# listed variables. return.df = TRUE returns the matched data frames
# (used below as fl.out$dfA.match and fl.out$dfB.match).
fl.out <- fastLink(speakers, members,
                   varnames = c("chamber", "state_abbrev", "party", "congress",
                                "last_name", "first_name", "district_code"),
                   return.df = TRUE,
                   return.all = FALSE)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
# number of voting members in the speakers data
speakers %>% filter(nonvoting == "voting") %>% nrow()
## [1] 2715
# number of speaker rows fastLink matched
fl.out$dfA.match %>% nrow()
## [1] 2715
# first matched rows from the speakers side
fl.out$dfA.match %>% head() %>% kablebox()
speakerid | lastname | firstname | chamber | state_abbrev | gender | party | district | nonvoting | congress | last_name | first_name | district_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
982 | 108115500 | DEAL | NATHAN | House | GA | M | R | 10 | voting | 108 | DEAL | Nathan | 10 |
617 | 108112321 | CAMPBELL | BEN | Senate | CO | M | R | NA | voting | 108 | CAMPBELL | Ben | 0 |
787 | 108117630 | HALL | RALPH | House | TX | M | R | 4 | voting | 108 | HALL | Ralph | 4 |
848 | 108113900 | TAUZIN | WILLIAM | House | LA | M | R | 3 | voting | 108 | TAUZIN | William | 3 |
719 | 108121351 | SHELBY | RICHARD | Senate | AL | M | R | NA | voting | 108 | SHELBY | Richard | 0 |
726 | 108113101 | JEFFORDS | JAMES | Senate | VT | M | I | NA | voting | 108 | JEFFORDS | James | 0 |
# first matched rows from the members side
fl.out$dfB.match %>% head() %>% kablebox()
chamber | congress | bioname | pattern | first_name | first_initial | common_name | middle_name | middle_initial | maiden_name | last_name | add_last_name | id | icpsr | party_code | cqlabel | state | state_abbrev | bioImgURL | seo_name | district_code | party_name | nominate.dim2 | nominate.dim1 | nominate.geo_mean_probability | party_size | first_last | first_maiden | common_last | first_middle_last | first_initial_last | common_middle_last | last | last_comma_first | last_first | first_maiden_last | common_middle_initial_last | common_maiden | commoninitial_last | first_middle_initial_last | firstinitial_middleinitial_last | last_comma_initial | last_comma_commoninitial | last_comma_common | maiden_comma_first | maiden_comma_firstinitial | chamber_last | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | House | 108 | DEAL, John Nathan | john deal|john nathan deal|deal|john n deal|nathan deal|nathan nathan deal|nathan n deal|deal|(^|senator |representative )dealdeal, nathan|deal, john|deal john|deal, jdeal, nrepresentative dealj n deal | John | J | Nathan | Nathan | N | NA | DEAL | NA | MH10899342 | 99342 | 200 | (GA-10) | georgia | GA | 099342.jpg | john-nathan-deal | 10 | Republican Party | 0.139 | 0.581 | 0.8970000000000000 | 231 | John DEAL | 404error | Nathan DEAL | John Nathan DEAL | DEAL | Nathan Nathan DEAL | (^|senator |representative )DEAL/td> | DEAL, John | DEAL John | 404error | Nathan N DEAL | 404error | DEAL | John N DEAL | J N DEAL | DEAL, J/td> | DEAL, N/td> | DEAL, Nathan | 404error | 404error | Representative DEAL/td> | R |
3 | Senate | 108 | CAMPBELL, Ben Nighthorse | ben campbell|ben nighthorse campbell|campbell|ben n campbell|benjamin campbell|benjamin nighthorse campbell|benjamin n campbell|(^|senator |representative )campbellcampbell, benjamin|campbell, ben|campbell ben|campbell, bsenator campbellb n campbell | Ben | B | Benjamin | Nighthorse | N | NA | CAMPBELL | NA | MS10895407 | 95407 | 200 | (CO) | colorado | CO | 095407.jpg | ben-nighthorse-campbell | 0 | Republican Party | 0.132 | 0.230 | 0.8220000000000000 | 51 | Ben CAMPBELL | 404error | Benjamin CAMPBELL | Ben Nighthorse CAMPBELL | CAMPBELL | Benjamin Nighthorse CAMPBELL | (^|senator |representative )CAMPBELL/td> | CAMPBELL, Ben | CAMPBELL Ben | 404error | Benjamin N CAMPBELL | 404error | CAMPBELL | Ben N CAMPBELL | B N CAMPBELL | CAMPBELL, B/td> | CAMPBELL, B/td> | CAMPBELL, Benjamin | 404error | 404error | Senator CAMPBELL/td> | R |
4 | House | 108 | HALL, Ralph Moody | ralph hall|ralph moody hall|hall|ralph m hall|hall|(^|senator |representative )hallhall, ralph|hall ralph|hall, rrepresentative hallr m hall | Ralph | R | NA | Moody | M | NA | HALL | NA | MH10894828 | 94828 | 200 | (TX-04) | texas | TX | 094828.jpg | ralph-moody-hall | 4 | Republican Party | 0.308 | 0.424 | 0.8149999999999999 | 231 | Ralph HALL | 404error | 404error | Ralph Moody HALL | HALL | 404error | (^|senator |representative )HALL/td> | HALL, Ralph | HALL Ralph | 404error | 404error | 404error | HALL | Ralph M HALL | R M HALL | HALL, R/td> | 404error | 404error | 404error | 404error | Representative HALL/td> | R |
5 | House | 108 | TAUZIN, Wilbert Joseph (Billy) | wilbert tauzin|wilbert joseph tauzin|tauzin|wilbert j tauzin|billy tauzin|billy joseph tauzin|billy j tauzin|tauzin|(^|senator |representative )tauzintauzin, billy|tauzin, wilbert|tauzin wilbert|tauzin, wtauzin, brepresentative tauzinw j tauzin | Wilbert | W | Billy | Joseph | J | NA | TAUZIN | NA | MH10894679 | 94679 | 200 | (LA-03) | louisiana | LA | 094679.jpg | wilbert-joseph-billy-tauzin | 3 | Republican Party | 0.160 | 0.341 | 0.9100000000000000 | 231 | Wilbert TAUZIN | 404error | Billy TAUZIN | Wilbert Joseph TAUZIN | TAUZIN | Billy Joseph TAUZIN | (^|senator |representative )TAUZIN/td> | TAUZIN, Wilbert | TAUZIN Wilbert | 404error | Billy J TAUZIN | 404error | TAUZIN | Wilbert J TAUZIN | W J TAUZIN | TAUZIN, W/td> | TAUZIN, B/td> | TAUZIN, Billy | 404error | 404error | Representative TAUZIN/td> | R |
6 | Senate | 108 | SHELBY, Richard C. | richard shelby|richard c shelby|shelby|rich shelby|rich c shelby|(^|senator |representative )shelbyshelby, rich|shelby, richard|shelby richard|shelby, rsenator shelbyr c shelby | Richard | R | Rich | C | C | NA | SHELBY | NA | MS10894659 | 94659 | 200 | (AL) | alabama | AL | 094659.jpg | richard-c-shelby | 0 | Republican Party | 0.520 | 0.429 | 0.8766256675360421 | 51 | Richard SHELBY | 404error | Rich SHELBY | Richard C SHELBY | SHELBY | Rich C SHELBY | (^|senator |representative )SHELBY/td> | SHELBY, Richard | SHELBY Richard | 404error | Rich C SHELBY | 404error | SHELBY | Richard C SHELBY | R C SHELBY | SHELBY, R/td> | SHELBY, R/td> | SHELBY, Rich | 404error | 404error | Senator SHELBY/td> | R |
7 | Senate | 108 | JEFFORDS, James Merrill | james jeffords|james merrill jeffords|jeffords|james m jeffords|jim jeffords|jim merrill jeffords|jim m jeffords|(^|senator |representative )jeffordsjeffords, jim|jeffords, james|jeffords james|jeffords, jsenator jeffordsj m jeffords | James | J | Jim | Merrill | M | NA | JEFFORDS | NA | MS10894240 | 94240 | 328 | (VT) | vermont | VT | 094240.jpg | james-merrill-jeffords | 0 | Independent | -0.603 | -0.277 | 0.7340000000000000 | 1 | James JEFFORDS | 404error | Jim JEFFORDS | James Merrill JEFFORDS | JEFFORDS | Jim Merrill JEFFORDS | (^|senator |representative )JEFFORDS/td> | JEFFORDS, James | JEFFORDS James | 404error | Jim M JEFFORDS | 404error | JEFFORDS | James M JEFFORDS | J M JEFFORDS | JEFFORDS, J/td> | JEFFORDS, J/td> | JEFFORDS, Jim | 404error | 404error | Senator JEFFORDS/td> | I |
# Attach voteview identifiers from the matched members rows to the matched
# speaker rows, then add back the speakers that fastLink did not match.
# NOTE(review): the column-wise copy assumes dfA.match and dfB.match are
# row-aligned (row i of one is the match of row i of the other) -- confirm.
fl.speakers <- fl.out$dfA.match %>%
  mutate(icpsr = fl.out$dfB.match$icpsr,
         bioname = fl.out$dfB.match$bioname,
         party_name = fl.out$dfB.match$party_name,
         district_code_voteview = fl.out$dfB.match$district_code) %>%
  full_join(speakers)

# failed matches
fl.speakers %>% filter(is.na(icpsr)) %>% kablebox()
speakerid | lastname | firstname | chamber | state_abbrev | gender | party | district | nonvoting | congress | last_name | first_name | district_code | icpsr | bioname | party_name | district_code_voteview |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
107120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 107 | NORTON | Eleanor | NA | NA | NA | NA | NA |
107113400 | UNDERWOOD | ROBERT | House | GU | M | D | 0 | nonvoting | 107 | UNDERWOOD | Robert | NA | NA | NA | NA | NA |
107117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 107 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
107113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | A | 0 | nonvoting | 107 | ACEVEDO-VILA | Anibal | NA | NA | NA | NA | NA |
107117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 107 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
108120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 108 | NORTON | Eleanor | NA | NA | NA | NA | NA |
108118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 108 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
108117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 108 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
108117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 108 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
108113450 | ACEVEDO-VILA | ANIBAL | House | PR | M | P | 0 | nonvoting | 108 | ACEVEDO-VILA | Anibal | NA | NA | NA | NA | NA |
109117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 109 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
109120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 109 | NORTON | Eleanor | NA | NA | NA | NA | NA |
109118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 109 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
109114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 109 | FORTUNO | Luis | NA | NA | NA | NA | NA |
109117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 109 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
110120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 110 | NORTON | Eleanor | NA | NA | NA | NA | NA |
110117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 110 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
110114790 | FORTUNO | LUIS | House | PR | M | R | 0 | nonvoting | 110 | FORTUNO | Luis | NA | NA | NA | NA | NA |
110118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 110 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
110117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 110 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
111118680 | BORDALLO | MADELEINE | House | GU | F | D | 0 | nonvoting | 111 | BORDALLO | Madeleine | NA | NA | NA | NA | NA |
111120680 | NORTON | ELEANOR | House | DC | F | D | 0 | nonvoting | 111 | NORTON | Eleanor | NA | NA | NA | NA | NA |
111117560 | CHRISTENSEN | DONNA | House | VI | F | D | 0 | nonvoting | 111 | CHRISTENSEN | Donna | NA | NA | NA | NA | NA |
111121180 | SABLAN | GREGORIO | House | MP | M | D | 0 | nonvoting | 111 | SABLAN | Gregorio | NA | NA | NA | NA | NA |
111120820 | PIERLUISI | PEDRO | House | PR | M | D | 0 | nonvoting | 111 | PIERLUISI | Pedro | NA | NA | NA | NA | NA |
111117590 | FALEOMAVAEGA | ENI | House | AS | M | D | 0 | nonvoting | 111 | FALEOMAVAEGA | Eni | NA | NA | NA | NA | NA |
# party mismatches: speaker party differs from the first letter of the
# voteview party name
fl.speakers %>%
  filter(party != str_sub(party_name, 1, 1)) %>%
  select(party, party_name, everything()) %>%
  kablebox()
party | party_name | speakerid | lastname | firstname | chamber | state_abbrev | gender | district | nonvoting | congress | last_name | first_name | district_code | icpsr | bioname | district_code_voteview |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
R | Democratic Party | 108117300 | ALEXANDER | RODNEY | House | LA | M | 5 | voting | 108 | ALEXANDER | Rodney | 5 | 20327 | ALEXANDER, Rodney | 5 |
I | Republican Party | 107119090 | CRENSHAW | ANDER | House | FL | M | 4 | voting | 107 | CRENSHAW | Ander | 4 | 20111 | CRENSHAW, Ander | 4 |
I | Democratic Party | 111116471 | LIEBERMAN | JOSEPH | Senate | CT | M | NA | voting | 111 | LIEBERMAN | Joseph | 0 | 15704 | LIEBERMAN, Joseph I. | 0 |
I | Democratic Party | 110116471 | LIEBERMAN | JOSEPH | Senate | CT | M | NA | voting | 110 | LIEBERMAN | Joseph | 0 | 15704 | LIEBERMAN, Joseph I. | 0 |
D | Republican Party | 110114511 | SPECTER | ARLEN | Senate | PA | M | NA | voting | 110 | SPECTER | Arlen | 0 | 14910 | SPECTER, Arlen | 0 |
D | Republican Party | 109114511 | SPECTER | ARLEN | Senate | PA | M | NA | voting | 109 | SPECTER | Arlen | 0 | 14910 | SPECTER, Arlen | 0 |
# district mismatches
# fl.speakers %>% filter(district_code != district_code_voteview) %>% kablebox()
# start with fresh data (can eventually delete everything above up to map function if we use this method)
# speakers dataframe
speakers <- map_dfr(speakermaps$file, get_speakers)

# match to voteview format
speakers %<>%
  mutate(congress = congress %>% str_remove("^0") %>% as.numeric(),
         # expand a trailing "S" / "H" to the full chamber name
         chamber = chamber %>%
           str_replace("S$", "Senate") %>%
           str_replace("H$", "House"),
         district = ifelse(chamber == "Senate", 0, district),
         # clean up some 0s that are actually missing data
         district = ifelse(chamber == "House" & district == "0", NA, district) %>%
           as.numeric()
  )

speakers %<>%
  rename(state_abbrev = state)

# FIXME focusing on the 107-111th for now
speakers %<>%
  filter(congress > 106)

# combine information into one field
speakers %<>%
  mutate(speaker = paste(chamber, firstname, lastname, state_abbrev) %>%
           str_replace("Senate", "Senator") %>%
           str_replace("House", "Representative") %>%
           # drop a bare "R." first initial after the title
           str_replace("Representative R\\.", "Representative"))
# name matching function
source(here("code", "nameMethods.R"))

# typos in these data include "WM" instead of "William" and "R. Miller" for "Representative Brad Miller", the rest are known permutations and nicknames
source(here("code", "MemberNameTypos.R"))

# NOTE(review): extractMemberName() is defined outside this file; the code
# below relies on it adding `icpsr` and `bioname` columns -- confirm.
speakers1 <- speakers %>%
  extractMemberName(col_name = "speaker", members = members)

# FIXME, overmatches party switchers (and maybe others)
crosswalk <- speakers1 %>%
  select(speaker, speakerid, icpsr, bioname, congress, chamber, nonvoting) %>%
  distinct() %>%
  # add state, party and district from voteview
  left_join(members %>%
              select(icpsr, congress, chamber, state_abbrev, party_name, district_code)) %>%
  distinct()
My congressional name search function gets us 100% matches when we don’t use party information, but party-switchers get an ICPSR for both parties. When we match on party, we find that some party info in the speakers data are incorrect.
# failed to match in crosswalk: voting members without an ICPSR id
crosswalk %>%
  filter(is.na(icpsr) & nonvoting == "voting") %>%
  kablebox()
speaker | speakerid | icpsr | bioname | congress | chamber | nonvoting | state_abbrev | party_name | district_code |
---|---|---|---|---|---|---|---|---|---|
Missing district data in the speakers data is fine; we can fill it in from voteview.
I have not yet found incorrect districts, but these would cause this to fail (and rightly so, getting the district right is important!).
However, the speakers data has a few incorrect parties:

- Crenshaw was never an independent.
- Alexander was a D in the 108th.
- Specter switched in 2009 (111th); he was not a D in the 109th or 110th.
- Lieberman called himself an “independent Democrat” and caucused with the Ds, so he is only a D in voteview.
These can be corrected, or we can use those from voteview if we decide to go with their first party or modal party. There are likely more instances in the speakers data where their party is coded inconsistently with either principle.
This result is identical to fastLink.
# failed to match in data
test <- speakers %>%
  left_join(crosswalk) %>%
  left_join(members %>%
              select(icpsr, chamber, congress, state_abbrev, party_name, district_code) %>%
              distinct()) %>%
  # require district match where there is district data in speaker data
  filter((district == district_code | is.na(district))) %>%
  # drop party switchers
  filter(party == str_sub(party_name, 1, 1) | is.na(party_name)) %>%
  distinct()

# speakers with no surviving row in `test`
speakers %>%
  anti_join(test) %>%
  select(party, everything()) %>%
  kablebox()
party | speakerid | lastname | firstname | chamber | state_abbrev | gender | district | nonvoting | congress | speaker |
---|---|---|---|---|---|---|---|---|---|---|
I | 107119090 | CRENSHAW | ANDER | House | FL | M | 4 | voting | 107 | Representative ANDER CRENSHAW FL |
R | 108117300 | ALEXANDER | RODNEY | House | LA | M | 5 | voting | 108 | Representative RODNEY ALEXANDER LA |
D | 109114511 | SPECTER | ARLEN | Senate | PA | M | 0 | voting | 109 | Senator ARLEN SPECTER PA |
D | 110114511 | SPECTER | ARLEN | Senate | PA | M | 0 | voting | 110 | Senator ARLEN SPECTER PA |
I | 110116471 | LIEBERMAN | JOSEPH | Senate | CT | M | 0 | voting | 110 | Senator JOSEPH LIEBERMAN CT |
I | 111116471 | LIEBERMAN | JOSEPH | Senate | CT | M | 0 | voting | 111 | Senator JOSEPH LIEBERMAN CT |