### Load packages
# Standard
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
# Network specific
library(tidygraph) # For tidy-style graph manipulation
library(ggraph) # For ggplot2 style graph plotting
# EconGeo Specific
library(EconGeo) # Economc Geography functions
So, let’s start the fun. I for you extracted Chinese patents from our EPO PATSTAT databases filed at either the EPO or the USTPO. I further provide you adittional data. Lets take a look:
patents <- readRDS("data/CN_patent.rds")
inventors <- readRDS("data/CN_inventor.rds") %>% filter(str_detect(city, "china")) %>% mutate(city = city %>% str_remove("\\,.*"))
el_pat_inv <- readRDS("data/CN_el_inventor_patent.rds")
el_pat_tech <- readRDS("data/CN_el_patent_field.rds")
field_names <- read_csv("data/ipc_technology.csv") %>%
select(field_nr, sector, field_name) %>%
distinct(field_nr, .keep_all = TRUE) %>%
mutate(field_nr = field_nr) %>%
arrange(field_nr)
cities_geo <- inventors %>%
distinct(city, .keep_all = TRUE) %>%
select(city, lon, lat)
reg_pat <- el_pat_inv %>%
arrange(appln_id, invt_seq_nr) %>%
distinct(appln_id, .keep_all = TRUE) %>%
left_join(inventors %>% select(person_id, city), by = "person_id") %>%
left_join(patents %>% select(appln_id, appln_filing_year, nb_citing_docdb_fam), by = "appln_id") %>%
select(-invt_seq_nr, person_id)
Let’s not take a look at the complexity of Chinese patenting activity.
We will here use some neath functions from theR
package EconGeo
. Its only on github, so we have to install it from there.
#devtools::install_github("PABalland/EconGeo", force = T)
library(EconGeo)
First, here we will need to create some big matrices, therefore I will here already define a little helper function which takes an edgelist as input and produces really efficient a sparse M[i,j]
2-mode matrix.
## Helper function
create_sparse_matrix <- function(i.input, j.input){
require(Matrix)
mat <- spMatrix(nrow = i.input %>% n_distinct(),
ncol = j.input %>% n_distinct(),
i = i.input %>% factor() %>% as.numeric(),
j = j.input %>% factor() %>% as.numeric(),
x = rep(1, i.input %>% length() ) )
row.names(mat) <- i.input %>% factor() %>% levels()
colnames(mat) <- j.input %>% factor() %>% levels()
return(mat)
}
Now, we will create this sparse matrix between patents and their associated technology classes.
mat_tech <- create_sparse_matrix(i = el_pat_tech %>% pull(appln_id),
j = el_pat_tech %>% pull(techn_field_nr))
Reminding basic matrix algebra, a 2-mode matrix M[i,j]
can be transformed to a 1-mode adjacency matrix M[j,j]
by taking the dot-product (matrix multiplication) of its transposed version t(M[i,j])
, and into a 1-mode adjacency matrix M[i,i]
by taking the dot-product with itself. In summary: M[i,j] %*% M[i,j] = M[j,j]
and M[i,j] %*% t(M[i,j]) = M[i,i]
.
We can do that very efficiently with the crossprod()
and tcrossprod()
function of the Matrix
package.
mat_tech %<>%
crossprod() %>%
as.matrix()
#mat.pat <- mat.tech %>% tcrossprod() %>% as.matrix() # Would create a patent matrix based on shared IPC classes (Not a good idea here)
mat_tech
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 9416 909 213 63 209 476 8 411 646 390 15 487 79 43 4
## 2 909 8670 629 572 193 2957 45 555 936 224 4 137 39 4 1
## 3 213 629 7477 4345 289 1083 80 27 174 274 3 134 20 3 0
## 4 63 572 4345 14724 303 2785 271 10 23 354 0 153 12 3 1
## 5 209 193 289 303 1817 418 3 50 15 96 0 90 5 3 0
## 6 476 2957 1083 2785 418 15567 497 255 430 597 20 479 147 19 22
## 7 8 45 80 271 3 497 747 0 2 30 2 85 7 1 0
## 8 411 555 27 10 50 255 0 4826 732 151 5 22 9 90 4
## 9 646 936 174 23 15 430 2 732 4246 195 3 24 48 22 3
## 10 390 224 274 354 96 597 30 151 195 3722 202 227 198 41 82
## 11 15 4 3 0 0 20 2 5 3 202 457 9 22 74 201
## 12 487 137 134 153 90 479 85 22 24 227 9 1824 44 4 2
## 13 79 39 20 12 5 147 7 9 48 198 22 44 1488 24 50
## 14 43 4 3 3 3 19 1 90 22 41 74 4 24 2852 346
## 15 4 1 0 1 0 22 0 4 3 82 201 2 50 346 1420
## 16 4 1 0 0 0 6 0 2 0 15 96 1 77 1283 623
## 17 101 26 0 0 0 6 0 72 62 15 9 2 28 180 31
## 18 5 0 0 0 0 3 0 0 1 3 3 2 7 64 119
## 19 116 28 0 0 1 9 1 143 114 27 22 2 28 390 104
## 20 183 26 10 0 0 12 0 71 31 15 5 4 17 59 6
## 21 220 129 29 1 7 59 1 221 162 40 7 6 29 30 9
## 22 133 39 6 1 1 14 0 156 52 58 18 0 22 17 26
## 23 136 33 4 2 0 12 0 46 34 138 65 20 64 284 65
## 24 23 9 1 0 3 11 0 20 3 56 6 6 49 19 19
## 25 55 65 20 1 1 55 2 36 31 60 4 76 36 3 3
## 26 91 113 15 0 1 33 1 59 64 59 1 19 25 5 3
## 27 145 72 4 1 1 51 5 21 21 77 4 30 58 3 1
## 28 28 23 18 2 4 41 0 19 25 14 3 2 13 16 15
## 29 120 67 22 2 1 27 4 33 76 19 5 9 49 25 31
## 30 96 320 7 1 2 59 0 161 16 28 3 33 22 6 1
## 31 114 222 40 3 3 125 1 22 46 45 5 55 43 2 1
## 32 172 33 15 15 2 63 3 9 16 70 4 84 15 0 0
## 33 94 267 35 26 3 166 4 13 23 32 0 39 54 1 0
## 34 112 122 33 10 16 97 1 34 32 39 0 34 107 5 1
## 35 68 139 56 6 2 122 0 3 6 56 3 46 11 2 0
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 1 4 101 5 116 183 220 133 136 23 55 91 145 28 120 96 114
## 2 1 26 0 28 26 129 39 33 9 65 113 72 23 67 320 222
## 3 0 0 0 0 10 29 6 4 1 20 15 4 18 22 7 40
## 4 0 0 0 0 0 1 1 2 0 1 0 1 2 2 1 3
## 5 0 0 0 1 0 7 1 0 3 1 1 1 4 1 2 3
## 6 6 6 3 9 12 59 14 12 11 55 33 51 41 27 59 125
## 7 0 0 0 1 0 1 0 0 0 2 1 5 0 4 0 1
## 8 2 72 0 143 71 221 156 46 20 36 59 21 19 33 161 22
## 9 0 62 1 114 31 162 52 34 3 31 64 21 25 76 16 46
## 10 15 15 3 27 15 40 58 138 56 60 59 77 14 19 28 45
## 11 96 9 3 22 5 7 18 65 6 4 1 4 3 5 3 5
## 12 1 2 2 2 4 6 0 20 6 76 19 30 2 9 33 55
## 13 77 28 7 28 17 29 22 64 49 36 25 58 13 49 22 43
## 14 1283 180 64 390 59 30 17 284 19 3 5 3 16 25 6 2
## 15 623 31 119 104 6 9 26 65 19 3 3 1 15 31 1 1
## 16 2290 66 73 202 15 14 18 19 5 2 1 1 3 29 1 2
## 17 66 1316 6 367 60 126 33 123 12 6 4 3 55 243 1 7
## 18 73 6 310 29 3 3 0 20 3 10 2 0 4 40 3 1
## 19 202 367 29 1661 161 158 46 220 48 10 23 13 45 80 27 10
## 20 15 60 3 161 1135 232 111 214 76 9 59 15 43 67 38 15
## 21 14 126 3 158 232 1541 107 112 15 52 106 18 85 222 19 24
## 22 18 33 0 46 111 107 554 43 5 2 8 2 34 34 3 2
## 23 19 123 20 220 214 112 43 1711 251 53 28 45 60 92 53 67
## 24 5 12 3 48 76 15 5 251 653 14 13 31 2 20 32 14
## 25 2 6 10 10 9 52 2 53 14 1150 86 7 32 57 7 113
## 26 1 4 2 23 59 106 8 28 13 86 1368 22 17 63 57 96
## 27 1 3 0 13 15 18 2 45 31 7 22 911 0 14 77 89
## 28 3 55 4 45 43 85 34 60 2 32 17 0 565 59 2 8
## 29 29 243 40 80 67 222 34 92 20 57 63 14 59 1102 14 24
## 30 1 1 3 27 38 19 3 53 32 7 57 77 2 14 1003 46
## 31 2 7 1 10 15 24 2 67 14 113 96 89 8 24 46 1512
## 32 2 17 0 6 6 17 1 14 13 66 28 51 4 31 18 131
## 33 2 9 18 6 4 14 2 56 9 96 43 17 8 44 36 196
## 34 2 15 0 12 17 69 9 41 5 58 28 14 67 41 68 43
## 35 0 14 1 24 15 33 3 59 27 61 37 33 3 44 20 183
## 32 33 34 35
## 1 172 94 112 68
## 2 33 267 122 139
## 3 15 35 33 56
## 4 15 26 10 6
## 5 2 3 16 2
## 6 63 166 97 122
## 7 3 4 1 0
## 8 9 13 34 3
## 9 16 23 32 6
## 10 70 32 39 56
## 11 4 0 0 3
## 12 84 39 34 46
## 13 15 54 107 11
## 14 0 1 5 2
## 15 0 0 1 0
## 16 2 2 2 0
## 17 17 9 15 14
## 18 0 18 0 1
## 19 6 6 12 24
## 20 6 4 17 15
## 21 17 14 69 33
## 22 1 2 9 3
## 23 14 56 41 59
## 24 13 9 5 27
## 25 66 96 58 61
## 26 28 43 28 37
## 27 51 17 14 33
## 28 4 8 67 3
## 29 31 44 41 44
## 30 18 36 68 20
## 31 131 196 43 183
## 32 1122 67 23 54
## 33 67 1561 86 130
## 34 23 86 1048 35
## 35 54 130 35 1330
This matrix can be used for the relatedness()
function to transform the co-occurence adjacency matrix to a similarity matrix. This function computes the relatedness (Hidalgo et al., 2007; Boschma et al., 2015; Balland, 2016) between entities (industries, technologies,…) from their co-occurrence (adjacency) matrix. Different normalization procedures are proposed following van Eck and Waltman (2009): association strength, cosine, Jaccard, and an adapted version of the association strength (probability index). I here choose the cosine similarity, since it is not so sensitive to scale.
mat_tech_rel <- mat_tech %>%
relatedness(method = "cosine")
Lets check what happened?
mat_tech_rel[1:10,1:10]
## 1 2 3 4 5 6
## 1 0.000000000 0.12444247 0.031647291 0.008611730 0.065189691 0.05833133
## 2 0.124442471 0.00000000 0.076368437 0.063892840 0.049192202 0.29610968
## 3 0.031647291 0.07636844 0.000000000 0.526742229 0.079944574 0.11770148
## 4 0.008611730 0.06389284 0.526742229 0.000000000 0.077112838 0.27846562
## 5 0.065189691 0.04919220 0.079944574 0.077112838 0.000000000 0.09536861
## 6 0.058331334 0.29610968 0.117701481 0.278465617 0.095368611 0.00000000
## 7 0.003189473 0.01466046 0.028286382 0.088155542 0.002226816 0.14493786
## 8 0.090412297 0.09976656 0.005267541 0.001794887 0.020478111 0.04103197
## 9 0.131096548 0.15521758 0.031316016 0.003808360 0.005667405 0.06382984
## 10 0.080553306 0.03780708 0.050191230 0.059658653 0.036916814 0.09019649
## 7 8 9 10
## 1 0.0031894727 0.090412297 0.1310965481 0.08055331
## 2 0.0146604648 0.099766557 0.1552175800 0.03780708
## 3 0.0282863822 0.005267541 0.0313160155 0.05019123
## 4 0.0881555416 0.001794887 0.0038083600 0.05965865
## 5 0.0022268162 0.020478111 0.0056674045 0.03691681
## 6 0.1449378553 0.041031973 0.0638298363 0.09019649
## 7 0.0000000000 0.000000000 0.0009658712 0.01474587
## 8 0.0000000000 0.000000000 0.1950549954 0.04095274
## 9 0.0009658712 0.195054995 0.0000000000 0.04878808
## 10 0.0147458733 0.040952740 0.0487880757 0.00000000
Now, at the row-column intercept, we will not have the number of co-occurences, but the relatedness index instead.
With this matrix, we can now create the Chinese “Technology Space” a’la Hidalgo & Hausmann (2007). Here we create a network of technology fields. I will delete below average weights to create some sparsity, and calculate the eigenvector centrality. I will here use the Eigenvector centrality as an easy proxi for technological complexity.
library(tidygraph)
g_tech <- mat_tech_rel %>% as_tbl_graph(directed = FALSE) %N>%
left_join(field_names %>% mutate(field_nr = field_nr %>% as.character()), by = c("name" = "field_nr")) %>%
mutate(dgr = centrality_eigen(weights = weight)) %E>%
filter(weight >= mean(weight))
g_tech
## # A tbl_graph: 35 nodes and 177 edges
## #
## # An undirected simple graph with 1 component
## #
## # Edge Data: 177 x 3 (active)
## from to weight
## <int> <int> <dbl>
## 1 1 2 0.124
## 2 1 3 0.0316
## 3 1 5 0.0652
## 4 1 6 0.0583
## 5 1 8 0.0904
## 6 1 9 0.131
## # … with 171 more rows
## #
## # Node Data: 35 x 4
## name sector field_name dgr
## <chr> <chr> <chr> <dbl>
## 1 1 Electrical engineering Electrical machinery, apparatus, energy 0.731
## 2 2 Electrical engineering Audio-visual technology 0.895
## 3 3 Electrical engineering Telecommunications 0.825
## # … with 32 more rows
We will straight away visualize it as a network. We scale the nodes by their centrality, and color them by their higher level tech-sector.
In order to be able to reproduce the same node position in comparative graphs lateron, we will first save the node layout (classical fruchterman-reingold layout).
coords_tech <- g_tech %>% igraph::layout.fruchterman.reingold() %>% as_tibble()
colnames(coords_tech) <- c("x", "y")
library(ggraph)
g_tech %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = sector, size = dgr)) +
geom_node_text(aes(label = field_name), size = 4, repel = TRUE) +
theme_graph()
Lets see which tech-field is the most central (complex) one:
g_tech %N>%
arrange(desc(dgr)) %>%
as_tibble() %>%
slice(1:10)
And the least one…
g_tech %N>%
arrange(dgr) %>%
as_tibble() %>%
slice(1:10)
Optimally, one would have created this space on world-data, and not only Chinese patents, though. Let’s, now that we know the overall space, take a look at city level specialization and complexity.
We here first join our city level patent data with the technology fields. Since a patent can have multiple ones, we this time fractionalize it with a field_weight
.
reg_tech <- reg_pat %>%
left_join(el_pat_tech %>% select(appln_id, techn_field_nr), by = "appln_id") %>%
group_by(appln_id) %>%
mutate(field_weight = 1 / n()) %>%
ungroup()
Now we first aggregate it on the whole time frame. We could for sure also do that sepperate for different years. We filtr our cities with
reg_tech %<>%
group_by(city, techn_field_nr) %>%
summarise(n_tech_reg = sum(field_weight)) %>%
ungroup() %>%
drop_na()
Now, we for the EconGeo
package again have to transform it to a matrix. Again, we have to set the rownames, which are depreciated in the tibble
mat_reg_tech <- reg_tech %>%
arrange(techn_field_nr, city) %>%
pivot_wider(names_from = techn_field_nr, values_from = n_tech_reg, values_fill = list(n_tech_reg = 0))
rownames(mat_reg_tech) <- mat_reg_tech %>% pull(city)
mat_reg_tech %<>% select(-city) %>%
as.matrix() %>%
round()
Lets take a look:
mat_reg_tech[1:10, 1:10]
## 1 2 3 4 5 6 7 8 9 10
## baicheng 1 0 0 0 0 0 0 0 0 0
## baoding 1 0 0 0 1 0 0 1 2 3
## baoji 3 0 0 0 0 1 0 0 0 0
## baotou 1 0 0 0 0 1 0 0 0 0
## beijing 802 1452 1202 3114 230 4807 152 1519 1143 666
## cangzhou 0 0 0 0 0 0 0 0 0 0
## changchun 9 0 0 0 0 1 0 10 3 7
## changsha 10 1 1 2 0 9 0 1 2 8
## changshu 5 4 1 0 0 1 0 1 1 0
## changzhou 20 1 0 4 0 5 0 2 1 8
Now, we can use the location.quotient()
function from EconGeo
. This function computes location quotients from (incidence) regions - industries matrices. The numerator is the share of a given industry in a given region. The denominator is the share of a this industry in a larger economy (overall country for instance). This index is also referred to as the index of Revealed Comparative Advantage (RCA) following Ballasa (1965), or the Hoover-Balassa index. To make it simple, we create a binary one, only indicating if the city has a RCA in the associated field or not. Afterwards, we transform it back to a tibble, set rownames again in a column, and gather it from a long to wide format.
reg_RCA <- mat_reg_tech %>% location.quotient(binary = TRUE) %>%
as.data.frame() %>%
rownames_to_column("city") %>%
as_tibble() %>%
gather(key = "tech_field", value = "RCA", -city) %>%
arrange(city, tech_field)
reg_RCA %>% head()
The Herfindahl()
function computes the Herfindahl index from regions - industries matrices from (incidence) regions - industries matrices. It is a measure of concentration. This index is also known as the Herfindahl-Hirschman index (Herfindahl, 1959; Hirschman, 1945).
mat_reg_tech %>%
Herfindahl() %>%
as.data.frame() %>%
rownames_to_column("city") %>%
rename(HH = ".") %>%
arrange(desc(HH)) %>%
head(10)
Next, we can compute the Shannon entropy index (Shannon and Weaver, 1949; Frenken et al., 2007) from regions - industries matrices from (incidence) regions - industries matrices.
mat_reg_tech %>%
entropy() %>%
as.data.frame() %>%
rownames_to_column("city") %>%
rename(SE = ".") %>%
arrange(desc(SE)) %>%
head(10)
Now, finally, we can inspect where in the technology space different cities are specialized. We select some of interest.
city_select <- c("beijing", "shanghai", "shenzhen")
In the following, I plot the technology-space and highlights in red the fields where the city has an RCA.
i = 1
g_tech %N>%
left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = RCA, size = dgr)) +
geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
scale_color_gradient(low = "skyblue", high = "red") +
theme_graph() +
ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))
i = 2
g_tech %N>%
left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = RCA, size = dgr)) +
geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
scale_color_gradient(low = "skyblue", high = "red") +
theme_graph() +
ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))
i = 3
g_tech %N>%
left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
ggraph(layout = coords_tech) +
geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") +
geom_node_point(aes(colour = RCA, size = dgr)) +
geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
scale_color_gradient(low = "skyblue", high = "red") +
theme_graph() +
ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))
So, what differences do we see?
You can find more info about:
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Matrix_1.2-17 EconGeo_1.3 ggraph_2.0.2 tidygraph_1.1.2
## [5] magrittr_1.5 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5
## [9] purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.3
## [13] ggplot2_3.3.0 tidyverse_1.3.0 knitr_1.28
##
## loaded via a namespace (and not attached):
## [1] ggrepel_0.8.2 Rcpp_1.0.4 lubridate_1.7.4 lattice_0.20-38
## [5] utf8_1.1.4 assertthat_0.2.1 digest_0.6.25 ggforce_0.3.1
## [9] R6_2.4.1 cellranger_1.1.0 backports_1.1.5 reprex_0.3.0
## [13] evaluate_0.14 httr_1.4.1 pillar_1.4.3 rlang_0.4.5
## [17] readxl_1.3.1 rstudioapi_0.11 rmarkdown_2.1 labeling_0.3
## [21] igraph_1.2.5 polyclip_1.10-0 munsell_0.5.0 broom_0.5.5
## [25] compiler_3.6.0 modelr_0.1.6 xfun_0.12 pkgconfig_2.0.3
## [29] htmltools_0.4.0 tidyselect_1.0.0 gridExtra_2.3 graphlayouts_0.6.0
## [33] viridisLite_0.3.0 fansi_0.4.1 crayon_1.3.4 dbplyr_1.4.2
## [37] withr_2.1.2 MASS_7.3-51.4 grid_3.6.0 nlme_3.1-139
## [41] jsonlite_1.6.1 gtable_0.3.0 lifecycle_0.2.0 DBI_1.1.0
## [45] scales_1.1.0 cli_2.0.2 stringi_1.4.6 farver_2.0.3
## [49] viridis_0.5.1 fs_1.3.2 xml2_1.2.5 ellipsis_0.3.0
## [53] generics_0.0.2 vctrs_0.2.4 tools_3.6.0 glue_1.3.2
## [57] tweenr_1.0.1 hms_0.5.3 yaml_2.2.1 colorspace_1.4-1
## [61] rvest_0.3.5 haven_2.2.0