netlit
This vignette shows examples of assessing bias in literature review networks based on covariates from metadata about the studies and authors included or excluded from the review on redistricting in the main manuscript. Specifically, for each study, we collect metadata on the lead author’s gender, H-Index, and total number of citations. We then assess the impact of selecting studies on covariates in two ways:
First, we subset the network (e.g., to studies where the lead author is a man) and observe how many nodes and edges are missing in these subsets. This reveals the contributions of underrepresented scholars to the network by showing what we lose if they are excluded.
Second, we draw random samples of 100 studies weighted by covariates. This simulates a literature review that is biased (e.g., toward scholars who are men or have many citations). We then compare these biased samples to an unweighted random sample of studies in the network.
<- literature_long %>%
lit distinct(to, from) %>%
review()
lit
## A netlit_review object with the following components:
##
## $edgelist
## - 69 edges
## - edge attributes: edge_betweenness
## $nodelist
## - 56 nodes
## - node attributes: degree_in, degree_out, degree_total, betweenness
## $graph
## an igraph object
# best seed 1,4, *5*
set.seed(5)
<- function(g){
netlit_plot ggraph(g, layout = 'fr') +
geom_node_point(
aes(color = degree_total %>% as.factor() ),
size = 6,
alpha = .7
+
) geom_edge_arc2(
start_cap = circle(3, 'mm'),
end_cap = circle(6, 'mm'),
aes(
color = edge_betweenness,
),curvature = 0,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
+
) geom_edge_loop(
start_cap = circle(5, 'mm'),
end_cap = circle(2, 'mm'),
aes( color = edge_betweenness),
n = 300,
strength = .6,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
+
) geom_node_text( aes(label = name), size = 2.3) +
::theme_void() +
ggplot2theme(legend.position="bottom") +
labs(edge_color = "Edge Betweenness",
color = "Total Degree\nCentrality",
edge_linetype = "") +
scale_edge_color_viridis(option = "plasma",
begin = 0,
end = .9,
direction = -1,
guide = "legend") +
scale_color_viridis_d(option = "mako",
begin = 1,
end = .5)
}
<- literature_long %>%
g distinct(to, from) %>%
review() %>%
$graph
.
%>%
g netlit_plot()
# for plotting bias
<- function(subgraph){
netlit_bias_plot
# lit with edge attribute indicating missing from subgraph
<- literature_long %>%
lit distinct(to, from) %>%
left_join( subgraph$edgelist %>% distinct(to, from) %>% mutate(missing_edges = "Not missing")
%>%
) mutate(missing_edges = replace_na(missing_edges, "Missing"))
%<>%
lit review(edge_attributes = names(lit))
# missing nodes
<- lit$nodelist$node[!lit$nodelist$node %in% subgraph$nodelist$node]
missing_nodes
set.seed(5)
ggraph(lit$g, layout = 'fr') +
geom_node_point(
aes(color = ifelse(name %in% missing_nodes, "Missing", "Not Missing")),
size = 6,
alpha = .7
+
) geom_edge_arc2(
start_cap = circle(3, 'mm'),
end_cap = circle(6, 'mm'),
aes(
color = missing_edges,
),curvature = 0,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
+
) geom_edge_loop(
start_cap = circle(5, 'mm'),
end_cap = circle(2, 'mm'),
aes(color = missing_edges),
n = 300,
strength = .6,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
+
) geom_node_text( aes(label = name), size = 2.3) +
::theme_void() +
ggplot2theme(legend.position="bottom") +
labs(edge_color = "",
color = "",
edge_linetype = "") +
scale_color_discrete() +
scale_edge_color_discrete()
}
%<>%
literature_long mutate(author_is_man = author_gender == "M")
# biased sample weights
%<>%
literature_long mutate(unbiased = .5,
weight = case_when(
~ .6,
author_is_man !author_is_man ~ .4,
TRUE~ .5
))
# a function to sample the network
<- function(n, literature_long, prob){
sample_lit
# create an index for the sample
<- sample(seq_len(nrow(literature_long)),
samp_idx 100, # 100 draws = number of studies to draw
prob=prob # with prob var provided
)
# subset sample to index
<- literature_long %>%
sample rowid_to_column() %>%
filter(rowid %in% samp_idx) %>%
distinct(to, from) %>%
review()
return(sample)
}
<-1000 n_samples
There are 165 studies in the original literature review. We draw 100 of them—first at random, then weighted random samples. For each type of simulated bias we use 1000 draws.
<- map(1:n_samples, # 100 samples
random_samples
sample_lit,literature_long=literature_long,
prob = literature_long$unbiased)
<- random_samples
samples
<- . %>% pull(edge_betweenness) %>% mean()
mean_edge_betw <- . %>% pull(betweenness) %>% mean()
mean_node_betw <- . %>% pull(degree_total) %>% mean()
mean_node_degree
# make a table of the total number of nodes, edges, and the graph object for plotting
<- function(samples){
summarise_samples <- tibble(
summary #edge stats
edges = samples %>% map(1) %>% modify(nrow) %>% unlist(),
edge_between_mean = samples %>% map(1) %>% modify(mean_edge_betw) %>% unlist(),
# nodes stats
nodes = samples %>% map(2) %>% modify(nrow) %>% unlist(),
node_between_mean = samples %>% map(2) %>% modify(mean_node_betw) %>% unlist(),
node_degree_mean = samples %>% map(2) %>% modify(mean_node_degree) %>% unlist(),
#graph stats
communities = samples %>% map(3) %>% modify(cluster_walktrap) %>% modify(length) %>% unlist(),
diameter = samples %>% map(3) %>% modify(diameter) %>% unlist(),
graph = samples %>% map(3)
)return(summary)
}
<- summarise_samples(samples)
summary
<- summary %>% mutate(
random sample = "Random"
)
# map(random$graph, netlit_plot)
map(random_samples[1:10], netlit_bias_plot)
Average nodes recovered: 43.8
Average node betweenness recovered: 2.9607115
Average edges recovered: 46.94
Average edge betweenness recovered: 5.1360215
Average node degree recovered: 2.1470984
Average communities recovered: 10.12
Average diameter recovered: 4.65
# biased samples
<- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
gender_samples
<- gender_samples
samples
<- summarise_samples(samples)
summary
<- summary %>% mutate(sample = "Gender bias favoring men")
gender
# map(gender_samples[1:10], netlit_bias_plot)
map(gender_samples[1:10], netlit_bias_plot)
Average nodes recovered: 44.25
Average node betweenness recovered: 2.9773211
Average edges recovered: 47.472
Average edge betweenness recovered: 5.1271756
Average node degree recovered: 2.1480552
Average communities recovered: 10.328
Average diameter recovered: 4.667
# biased sample weights
%<>%
literature_long mutate(weight = case_when(
~ 1,
author_is_man !author_is_man ~ .3,
TRUE~ .5
))
# biased samples
<- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
gender_samples
<- gender_samples
samples
<- summarise_samples(samples)
summary
<- summary %>% mutate(
gender sample = "Gender bias favoring men"
)
#map(gender$graph, netlit_plot)
map(gender_samples[1:10], netlit_bias_plot)
Average nodes recovered: 45.339
Average node betweenness recovered: 3.3160159
Average edges recovered: 48.951
Average edge betweenness recovered: 5.5354191
Average node degree recovered: 2.1615376
Average communities recovered: 10.722
Average diameter recovered: 4.837
# biased sample weights
%<>%
literature_long mutate(weight = case_when(
~ .3,
author_is_man !author_is_man ~ 1,
TRUE~ .5
))
<- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
gender_samples2
# biased samples
<- summarise_samples(samples)
summary
<- summary %>% mutate(
gender2 sample = "Gender bias favoring women"
)
#map(gender$graph, netlit_plot)
map(gender_samples2[1:10], netlit_bias_plot)
Average nodes recovered: 42.591
Average node betweenness recovered: 2.3101483
Average edges recovered: 44.627
Average edge betweenness recovered: 4.3232846
Average node degree recovered: 2.0983178
Average communities recovered: 9.96
Average diameter recovered: 4.249
(replacing NA HIndex with 0)
%<>%
literature_long mutate(author_h_index = replace_na(author_h_index, 0 ))
# biased samples
<- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
hindex_samples
<- summarise_samples(samples)
summary
<- summary %>% mutate(
hindex sample = "H-Index bias"
)
#map(gender$graph, netlit_plot)
map(hindex_samples[1:10], netlit_bias_plot)
Average nodes recovered: 42.591
Average node betweenness recovered: 2.3101483
Average edges recovered: 44.627
Average edge betweenness recovered: 4.3232846
Average node degree recovered: 2.0983178
Average communities recovered: 9.96
Average diameter recovered: 4.249
(replacing NA author citations with 0)
%<>%
literature_long mutate(author_citations = replace_na(author_citations, 0 ))
# gender-biased samples
<- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$author_citations)
citations_samples
<- citations_samples
samples
<- summarise_samples(samples)
summary
<- summary %>% mutate(
citations sample = "Citations bias"
)
# map(citations$graph, netlit_plot)
map(citations_samples[1:10], netlit_bias_plot) # %>% .[c(1:10)]
Average nodes recovered: 46.811
Average node betweenness recovered: 3.754296
Average edges recovered: 51.905
Average edge betweenness recovered: 6.1469967
Average node degree recovered: 2.2184447
Average communities recovered: 10.823
Average diameter recovered: 4.638
<- full_join(random, gender) %>%
s full_join(gender2) %>%
full_join(hindex) %>%
full_join(citations)
<- . %>% round(1)
round2
<- s %>% group_by(sample) %>%
s_table select(where(is.numeric)) %>% summarise_all(mean) %>%
group_by(sample) %>%
mutate_all(round2) %>%
arrange(rev(sample))
<- which(s_table$sample == "Random")
color.me
names(s_table) %<>% str_remove("_mean")
%>%
s_table kable(booktabs = T) %>%
kable_styling()
sample | edges | edge_between | nodes | node_between | node_degree | communities | diameter |
---|---|---|---|---|---|---|---|
Random | 46.9 | 5.1 | 43.8 | 3.0 | 2.1 | 10.1 | 4.7 |
H-Index bias | 44.6 | 4.3 | 42.6 | 2.3 | 2.1 | 10.0 | 4.2 |
Gender bias favoring women | 44.6 | 4.3 | 42.6 | 2.3 | 2.1 | 10.0 | 4.2 |
Gender bias favoring men | 49.0 | 5.5 | 45.3 | 3.3 | 2.2 | 10.7 | 4.8 |
Citations bias | 51.9 | 6.1 | 46.8 | 3.8 | 2.2 | 10.8 | 4.6 |
%>%
s ggplot() +
aes(x = nodes, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Nodes Recovered (out of 56)") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
%>%
s ggplot() +
aes(x = edges, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Edges Recovered (out of 69)") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
%>%
s ggplot() +
aes(x = edge_between_mean, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Average Edge Betweenness") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
%>%
s ggplot() +
aes(x = node_between_mean, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Average Node Betweenness") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
%>%
s ggplot() +
aes(x = node_degree_mean, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Average Degree") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
%>%
s ggplot() +
aes(x = communities, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Communities") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
%>%
s ggplot() +
aes(x = diameter, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Diameter") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())