### Load packages  
# Standard
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)

# Network specific
library(tidygraph) # For tidy-style graph manipulation
library(ggraph) # For ggplot2 style graph plotting

# Geocoding
library(ggmap)

A closer look at the Chinese patenting activity

Loading and understanding the data structure

So, let’s start the fun. I for you extracted Chinese patents from our EPO PATSTAT databases filed at either the EPO or the USTPO. I further provide you adittional data. Lets take a look:

patents <- readRDS("data/CN_patent.rds")
inventors <- readRDS("data/CN_inventor.rds") %>% filter(str_detect(city, "china")) %>% mutate(city = city %>% str_remove("\\,.*")) 
el_pat_inv <- readRDS("data/CN_el_inventor_patent.rds") 
el_pat_tech <- readRDS("data/CN_el_patent_field.rds")
patents %>% glimpse()
## Observations: 66,266
## Variables: 5
## $ appln_id            <int> 53568634, 53699173, 15765821, 48003535, 323548512…
## $ appln_filing_year   <int> 2001, 2001, 2000, 2002, 2010, 2003, 2002, 2002, 2…
## $ docdb_family_size   <int> 6, 13, 5, 7, 20, 11, 5, 5, 4, 8, 4, 10, 4, 4, 5, …
## $ nb_citing_docdb_fam <int> 14, 73, 43, 27, 81, 57, 7, 17, 1, 5, 29, 1, 1, 2,…
## $ nb_inventors        <int> 5, 5, 5, 1, 4, 2, 1, 1, 4, 6, 5, 1, 1, 2, 1, 2, 1…

This main dataset contains all Patents in the 2000-2015 period with Chinese inventors, filed at the USTPO or EPO. I only included priority (earliest) patent applications which got granted up to now. We have the following variables:

  • appln_id: PATSTAT id, unique identifier of patent application
  • appln_filing_year: Filing year of first priority
  • docdb_family_size: Size of the (simple) patent family
  • nb_citing_docdb_fam: Number of citations recieved by the patent family
  • nb_inventors: Number of inventors
inventors %>% glimpse()
## Observations: 82,678
## Variables: 5
## $ person_id <int> 4932, 5805, 5960, 6375, 6377, 6378, 6396, 6745, 6757, 7591,…
## $ psn_name  <chr> "XU, KAIHUA", "WANG, WEI", "DU, BIN", "TIAN, GUANGHUI", "WA…
## $ city      <chr> "shenzhen", "shenzhen", "shenzhen", "shanghai", "shanghai",…
## $ lon       <dbl> 114.06, 114.06, 114.06, 121.47, 121.47, 121.47, 117.12, 114…
## $ lat       <dbl> 22.54, 22.54, 22.54, 31.23, 31.23, 31.23, 36.65, 22.54, 22.…

This are all Chinese inventors indicated on the selected patents. Foreign inventors are excluded. We enriched this data with an geocoding exercise. It up to now managed to match only around 75% of all inventors, new and better version in the making.

  • person_id: Unique inventor identifyer
  • psn_name: OECD HAN (harminized) inventor name
  • city: Inventors city, extracted with own adressparsing (not 100%)
  • lon: City longitude (own geocoding, not 100%)
  • lat: City lattitude (own geocoding, not 100%)
el_pat_inv %>% glimpse()
## Observations: 168,559
## Variables: 3
## $ appln_id    <int> 1790, 1790, 1790, 1800, 1800, 1800, 1800, 1815, 1815, 181…
## $ person_id   <int> 56646827, 56799439, 56629287, 55622980, 49131108, 5562826…
## $ invt_seq_nr <int> 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 1, 1, 2, 3, 4, 5, 6, 7, 1, …

Edgeliste, matching patents with inventors.

el_pat_tech %>% glimpse()
## Observations: 103,096
## Variables: 3
## $ appln_id       <int> 1790, 1790, 1800, 1815, 1823, 1823, 1823, 2052, 2062, …
## $ techn_field_nr <int> 3, 4, 3, 3, 20, 23, 24, 4, 3, 2, 4, 3, 16, 17, 4, 4, 2…
## $ weight         <dbl> 0.666667, 0.333333, 1.000000, 1.000000, 0.200000, 0.60…

Edgeliste, matching patent with technology field.

Technology fields represent a higher level of aggregation compared to IPC classes, and classify technologies into 35 fields. Patents can be assigned to multiple fields simultaneously. Since the raw number is not very informative, lets get a field description.

field_names <- read_csv("data/ipc_technology.csv") %>%
  select(field_nr, sector, field_name) %>%
  distinct(field_nr, .keep_all = TRUE) %>%
  mutate(field_nr = field_nr) %>%
  arrange(field_nr)
field_names %>% distinct(field_name)

On a even higher level, these fields are classified in 5 sectors.

field_names %>% distinct(sector)

First investigation

Alright, lets take a first look at the development of chinese patents over time.

patents %>%
  group_by(appln_filing_year) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = appln_filing_year, y = n)) +
  geom_line()

Ok, we see that patent applications sharply increased over time… We see somewhat a decline after 2013, but I would suggest the reason to be a lag in reporting.

Lets look at the sectoral split.

patents_field <- patents %>%
  left_join(el_pat_tech, by = "appln_id") %>%
  left_join(field_names, by = c("techn_field_nr" = "field_nr")) 
patents_field %>%
  group_by(appln_filing_year, sector) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = appln_filing_year, y = n, col = sector)) +
  geom_line()

It seems as if electrical engineering leads by far…

The Geography of Chinese patents

Data munging

First, we will create a list of unique cities including their geolocation.

cities_geo <- inventors %>%
  distinct(city, .keep_all = TRUE) %>%
  select(city, lon, lat)

Now, lets create a dataset of patents including their applicants and geolocation. To make it easier, we will only look at the first applicants in the invt_seq_nr, and discard the rest (alternatively, we could fractionalize them).

reg_pat <- el_pat_inv %>%
  arrange(appln_id, invt_seq_nr) %>%
  distinct(appln_id, .keep_all = TRUE) %>%
  left_join(inventors %>% select(person_id, city), by = "person_id") %>%
  left_join(patents %>% select(appln_id, appln_filing_year, nb_citing_docdb_fam), by = "appln_id") %>%
  select(-invt_seq_nr, person_id) 

Top patenting cities

Lets see which cities account for the most patents. Any guesses?

top_cities <- reg_pat %>%
  group_by(city) %>%
  summarise(n = n(),
            n.cit = sum(nb_citing_docdb_fam)) %>%
  arrange(desc(n)) %>%
  top_n(10, n)
top_cities

Activity over time

Lets see how the cities developed over time.

reg_pat %>%
  group_by(city, appln_filing_year) %>%
  summarise(n = n(),
            n.cit = sum(nb_citing_docdb_fam)) %>%
  arrange(desc(n)) %>%
  filter(city %in% (top_cities %>% pull(city))) %>%
  ggplot(aes(x = appln_filing_year, y = n, col = city)) +
  geom_line()

Geoplotting

Lets see how that on aggregate looks on a map. Therefore, we first need some map-data for china. In the lasts ession I showed you how to create a simpel geoplot with border-shape files. This time we go a step further by downloading finer data directly from map applications, I use the excellent ggmap package for that, but it uses a google back-end and needs a developer-id (problematic here in China). While it can be still run here in the RStudioCloud (US server), it requires you to get an GoogleMaps developer account. If you dont want to do that, you can still use some of the functionalities (getting maps), while others (geocoding) will need a developer authentification to use the Google API.

Note: I figured out that Baidu has an almost equal package, so you might also explore that.

# https://gist.github.com/graydon/11198540
cn <- c(left = 73.6753792663, bottom = 18.197700914, right = 135.026311477, top = 53.4588044297)
cn_map <- get_stamenmap(cn, zoom = 5, maptype = "toner-lite") # Stamenmap is another provider of open source maps, so we dont need google here
cn_map %>% ggmap() 

Now, we just need to plot the cities in, scaled by patenting activity.

reg_pat_agg <- reg_pat %>%
  group_by(city) %>%
  summarise(n = n(),
            n.cit = sum(nb_citing_docdb_fam)) %>%
  left_join(cities_geo, by = "city") %>%
  drop_na()
cn_map %>% ggmap() +
  geom_point(data = reg_pat_agg, 
             aes(x = lon, y = lat, size = n, alpha = 0.5)) 

Ok, we can even do better, and add a nice density layer on top.

cn_map %>% 
  ggmap() +
  stat_density2d(data = reg_pat_agg, 
                 aes(x = lon, y = lat, fill = stat(nlevel), col = stat(nlevel) ), alpha = 0.2, size = 0.2, bins = 10, geom = "polygon") +
  scale_fill_gradient(low = "skyblue", high = "red") +
  scale_color_gradient(low = "skyblue", high = "red")

Neath, isn’t it? Here we really see where Chinese patenting activity agglomorates.

Endnotes

References

More info

You can find more info about:

Patent Data

R packages used

Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggmap_3.0.0     ggraph_2.0.2    tidygraph_1.1.2 magrittr_1.5   
##  [5] forcats_0.5.0   stringr_1.4.0   dplyr_0.8.5     purrr_0.3.3    
##  [9] readr_1.3.1     tidyr_1.0.2     tibble_2.1.3    ggplot2_3.3.0  
## [13] tidyverse_1.3.0 knitr_1.28     
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1       httr_1.4.1          jsonlite_1.6.1     
##  [4] viridisLite_0.3.0   modelr_0.1.6        assertthat_0.2.1   
##  [7] sp_1.4-1            cellranger_1.1.0    yaml_2.2.1         
## [10] ggrepel_0.8.2       pillar_1.4.3        backports_1.1.5    
## [13] lattice_0.20-38     glue_1.3.2          digest_0.6.25      
## [16] polyclip_1.10-0     rvest_0.3.5         colorspace_1.4-1   
## [19] htmltools_0.4.0     plyr_1.8.6          pkgconfig_2.0.3    
## [22] broom_0.5.5         haven_2.2.0         scales_1.1.0       
## [25] tweenr_1.0.1        jpeg_0.1-8.1        ggforce_0.3.1      
## [28] generics_0.0.2      farver_2.0.3        ellipsis_0.3.0     
## [31] withr_2.1.2         cli_2.0.2           crayon_1.3.4       
## [34] readxl_1.3.1        evaluate_0.14       fs_1.3.2           
## [37] fansi_0.4.1         nlme_3.1-139        MASS_7.3-51.4      
## [40] xml2_1.2.5          tools_3.6.0         hms_0.5.3          
## [43] RgoogleMaps_1.4.5.3 lifecycle_0.2.0     munsell_0.5.0      
## [46] reprex_0.3.0        isoband_0.2.0       compiler_3.6.0     
## [49] rlang_0.4.5         grid_3.6.0          rstudioapi_0.11    
## [52] rjson_0.2.20        igraph_1.2.5        bitops_1.0-6       
## [55] labeling_0.3        rmarkdown_2.1       gtable_0.3.0       
## [58] DBI_1.1.0           curl_4.3            graphlayouts_0.6.0 
## [61] R6_2.4.1            gridExtra_2.3       lubridate_1.7.4    
## [64] utf8_1.1.4          stringi_1.4.6       Rcpp_1.0.4         
## [67] vctrs_0.2.4         png_0.1-7           dbplyr_1.4.2       
## [70] tidyselect_1.0.0    xfun_0.12