### Install and oad packages if necessary
pacman::p_load(tidyverse,
magrittr,
tidymodels
)
Welcome to todays session.
patents <- readRDS(url("https://github.com/daniel-hain/ML_course_maastricht/raw/master/data/CN_patent.rds?raw=true"))
pat_abstr <- readRDS(url("https://github.com/daniel-hain/ML_course_maastricht/raw/master/data/CN_el_patent_abstract.rds?raw=true"))
pat_cpc <- readRDS(url("https://github.com/daniel-hain/ML_course_maastricht/raw/master/data/CN_el_cpc.rds?raw=true"))
patents %<>% filter(appln_filing_year >= 2013)
pat_abstr %<>% semi_join(patents, by = 'appln_id')
pat_cpc %<>% semi_join(patents, by = 'appln_id')
patents %>% head()
patents %>% glimpse()
Rows: 23,254
Columns: 5
$ appln_id <int> 447532870, 410465871, 410460263, 418875468, 416430384, 420520135, 438136257, 408895739, 417619658, 423833383, 420883518, 46…
$ appln_filing_year <int> 2015, 2013, 2013, 2014, 2013, 2014, 2014, 2013, 2014, 2013, 2014, 2014, 2014, 2014, 2014, 2015, 2015, 2015, 2014, 2013, 201…
$ docdb_family_size <int> 20, 19, 8, 18, 4, 17, 13, 10, 11, 5, 5, 12, 13, 4, 11, 11, 7, 2, 9, 3, 9, 23, 10, 12, 15, 20, 25, 20, 20, 7, 19, 19, 30, 17…
$ nb_citing_docdb_fam <int> 61, 17, 125, 0, 36, 21, 21, 8, 5, 10, 64, 44, 36, 7, 75, 33, 17, 28, 40, 5, 123, 8, 4, 70, 16, 32, 31, 16, 11, 10, 5, 11, 1…
$ nb_inventors <int> 3, 6, 9, 2, 4, 4, 2, 1, 2, 3, 2, 6, 2, 2, 4, 4, 6, 4, 5, 3, 4, 7, 2, 4, 4, 3, 4, 3, 4, 3, 6, 5, 5, 4, 3, 3, 5, 9, 10, 1, 10…
This main dataset contains all Patents in the 2000-2015 period with Chinese inventors, filed at the USTPO or EPO. I only included priority (earliest) patent applications which got granted up to now. We have the following variables:
appln_id
: PATSTAT id, unique identifier of patent applicationappln_filing_year
: Filing year of first prioritydocdb_family_size
: Size of the (simple) patent familynb_citing_docdb_fam
: Number of citations recieved by the patent familynb_inventors
: Number of inventorspatents %>% skimr::skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 23254
Number of columns 5
_______________________
Column type frequency:
numeric 5
________________________
Group variables None
── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 appln_id 0 1 441983572. 20224263. 380196462 422944781. 443529288. 451974322. 497361534 ▁▇▇▅▂
2 appln_filing_year 0 1 2014. 0.796 2013 2013 2014 2015 2015 ▇▁▇▁▆
3 docdb_family_size 0 1 3.79 3.47 1 2 3 4 195 ▇▁▁▁▁
4 nb_citing_docdb_fam 0 1 3.45 7.58 0 0 2 4 365 ▇▁▁▁▁
5 nb_inventors 0 1 3.19 2.21 1 2 3 4 30 ▇▁▁▁▁
pat_cpc %>% glimpse()
Rows: 150,944
Columns: 2
$ appln_id <int> 380196462, 380277821, 380277821, 380277821, 380277821, 380277821, 380277821, 380331159, 380331159, 380331159, 380331159, 38033…
$ cpc_class_symbol <chr> "B66B 23/22", "A61B 6/0407", "A61B 6/102", "A61B 6/4283", "A61B 6/4452", "F16M 13/022", "G03B 42/025", "F16J 15/06…
pat_cpc %>% head()
pat_cpc %>%
count(cpc_class_symbol, sort = TRUE) %>%
head(10)
pat_abstr %>% glimpse()
Rows: 22,694
Columns: 2
$ appln_id <int> 380196462, 380277821, 380331159, 380487873, 380565640, 380623547, 380646500, 380646539, 380717584, 380717586, 380717588, 3807528…
$ appln_abstract <chr> "An easily mounting and dismantling outer decking apparatus for an escalator or moving walkway is provided, which comprises: a c…
pat_abstr %>% select(-appln_id) %>% head(3)
We could have so much fun here exploring Chinese patents, but we have no time. However, I do another more exploratory lecture on the ame dataset, feel free to check:
We now aim at identifying renewable energy patents. This could be the starting point for an interesting analysis on all kind of things, but we here went to ask the following question:
Y02
which helps us to easily identify them.Lets identify renewable energy patents.
y_tag <- pat_cpc %>%
filter(cpc_class_symbol %>% str_starts('Y02')) %>%
distinct(appln_id) %>%
pull()
patents %<>%
mutate(y_tag = appln_id %in% y_tag)
patents %>% head()
rm(pat_cpc)
Most language analysis approaches are based on the analysis of texts word-by-word. Here, their order might matter (word sequence models) or not (bag-of-words models), but the smallest unit of analysis is usually the word. This is usually done in context of the document the word appeared in. Therefore, on first glance three types datastructures make sense:
tidytext
)tm
, quanteda
)Different forms of analysis (and the packages used therefore) favor different structures, so we need to be fluent in transfering original raw-text in these formats, as well as switching between them. (for more infos, check here).
library(tidytext)
pat_abstr_tidy <- pat_abstr %>%
unnest_tokens(output = word,
input = appln_abstract,
token = "words",
to_lower = TRUE,
drop = TRUE)
pat_abstr_tidy %<>%
mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>%
filter(str_length(word) > 2 ) %>%
group_by(word) %>%
filter(n() > 100) %>%
ungroup() %>%
anti_join(get_stopwords())
Joining, by = "word"
pat_abstr_tidy %<>%
add_count(appln_id, word) %>%
bind_tf_idf(term = word,
document = appln_id,
n = n)
pat_abstr_tidy %<>%
left_join(patents %>% select(appln_id, y_tag), by = "appln_id") %>%
relocate(appln_id, y_tag)
pat_abstr_tidy %>%
head()
pat_abstr_tidy %>%
count(word, wt = tf_idf, sort = TRUE) %>%
head(50)
pat_ytag_words <- pat_abstr_tidy %>%
group_by(y_tag) %>%
count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
slice(1:20) %>%
ungroup() %>%
mutate(word = reorder_within(word, by = tf_idf, within = y_tag))
pat_ytag_words %>%
ggplot(aes(x = word, y = tf_idf, fill = y_tag)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~y_tag, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
# only a small sample for illustration
pat_dtm <- pat_abstr_tidy %>%
select(appln_id, y_tag, word, tf_idf) %>%
head(1000) %>%
distinct(appln_id, word, .keep_all = TRUE)
# Now we just pivot wider
pat_dtm %<>%
pivot_wider(names_from = word, values_from = tf_idf, names_prefix = 'word_', values_fill = 0)
pat_dtm %>% head()
rm(pat_dtm)
data <- pat_abstr %>%
inner_join(patents %>% select(appln_id, y_tag), by = "appln_id") %>%
select(y_tag, appln_abstract) %>%
rename(y = y_tag, text = appln_abstract) %>%
mutate(y = y %>% as_factor()) %>%
mutate(text = text %>% str_to_lower() %>% str_remove_all('[^[:alnum:] ]') %>% str_squish()) %>%
drop_na()
data %>% skimr::skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 22694
Number of columns 2
_______________________
Column type frequency:
character 1
factor 1
________________________
Group variables None
── Variable type: character ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 text 0 1 22 1981 0 22615 0
── Variable type: factor ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique top_counts
1 y 0 1 FALSE 2 FAL: 21302, TRU: 1392
set.seed(1337)
data_split <- initial_split(data, prop = 0.75, strata = y)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
library(textrecipes)
data_recipe <- data_train %>%
recipe(y ~.) %>%
themis::step_downsample(y) %>% # due to class imbalances%>%
step_filter(text != "") %>%
# How textrecipes start
step_tokenize(text) %>% # Tokenizing
step_tokenfilter(text, min_times = 100) %>% # filter sparse terms
step_stopwords(text, keep = FALSE) %>% # Filter stopwords
step_tfidf(text) %>% # tfidf weighting
# step_knnimpute(all_predictors()) %>% # knn inputation of missing values Not necessary here
prep() # !!! NOTE: Only prep() the recipe if you dont want to use it in a workflow, otherwise there might b ussues
data_recipe
Recipe
Inputs:
Training data contained 17020 data points and no missing data.
Operations:
Down-sampling based on y [trained]
Row filtering [trained]
Tokenization for text [trained]
Text filtering for text [trained]
Stop word removal for text [trained]
Term frequency-inverse document frequency with text [trained]
data_train_prep <- data_recipe %>% juice()
data_test_prep <- data_recipe %>% bake(data_test)
# # Use this code in case you want to do parallel processing
# library(doParallel)
# all_cores <- parallel::detectCores(logical = FALSE)
# cl <- makePSOCKcluster(all_cores -1)
#registerDoParallel(cl)
model_en <- logistic_reg(mode = 'classification',
mixture = 0.25,
penalty = 0.25) %>%
set_engine('glm', family = binomial())
We will skip the workflow step this time, since we do not evaluate different models against each others.
fit_en <- model_en %>% fit(formula = y ~., data = data_train_prep)
pred_collected <- tibble(
truth = data_train_prep %>% pull(y),
pred = fit_en %>% predict(new_data = data_train_prep) %>% pull(.pred_class),
pred_prob = fit_en %>% predict(new_data = data_train_prep, type = "prob") %>% pull(.pred_TRUE),
)
pred_collected %>% conf_mat(truth, pred)
Truth
Prediction FALSE TRUE
FALSE 792 363
TRUE 253 682
pred_collected %>% conf_mat(truth, pred) %>% autoplot(type = 'heatmap')
pred_collected %>% conf_mat(truth, pred) %>% summary()
Machine learning (ML) models are often considered black boxes due to their complex inner-workings. More advanced ML models such as random forests, gradient boosting machines (GBM), artificial neural networks (ANN), among others are typically more accurate for predicting nonlinear, faint, or rare phenomena. Unfortunately, more accuracy often comes at the expense of interpretability, and interpretability is crucial for business adoption, model documentation, regulatory oversight, and human acceptance and trust. Luckily, several advancements have been made to aid in interpreting ML models.
Moreover, it’s often important to understand the ML model that you’ve trained on a global scale, and also to zoom into local regions of your data or your predictions and derive local explanations.
Finally, let’s get a feeling what variables the models mostly draw from. There are numerous ways for such inspections, in which we will just scratch the surface here. Here, I just want to present the most intuitive and common one, Variable Importance.
Most (but not all) model classes offer some possibility to derive measures of variable importance. Note, this is currently not implemented for SVMs. Again, in most ML models and setups, these measures are nice to give a rough intuition, but CANNOT be interpreted as constant marginal effects, left alone causal.
library(vip)
Attaching package: ‘vip’
The following object is masked from ‘package:utils’:
vi
vip(fit_en) + ggtitle("VarImp: Elastic Net")
In complex nonparametric models, it is often more helpful to get explanations why a certain datapoint is classified in the way it is than to look at the overall importance of variables.
Local Interpretable Model-agnostic Explanations (LIME) is a visualization technique that helps explain individual predictions. As the name implies, it is model agnostic so it can be applied to any supervised regression or classification model. There is a neath R
implementation in the lime
package. If you want to investigate further, feel free! Otherwise, keep in mind this exists, we will come back to it again in later lectures.
BTW: The original paper is mindblowing, if you find time, just read it!
library(lime)
Attaching package: ‘lime’
The following object is masked from ‘package:dplyr’:
explain
# Create an explainer object
explainer <- lime(data_train_prep, fit_en)
# Explain new observation
explanation <- explain(data_train_prep %>% slice(1:6), explainer, n_labels = 1, n_features = 5)
# The output is provided in a consistent tabular format and includes the output from the model.
explanation %>% head()
explanation %>% plot_features()
text_own = tibble(text = 'This device is going to save the world that includes a portion of solar photovoltaic power to control bio enriched energy saviung algae that produces a highly energy body state that can be used for mobile energy production.')
fit_en %>% predict(new_data = data_recipe %>% bake(text_own))
text_cordis <- read_csv('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz')
New names:
* `` -> ...1
Rows: 500 Columns: 15
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): language, title, teaser, summary, workPerformed, finalResults, projectAcronym, programme, topics, url
dbl (3): ...1, rcn, projectID
lgl (1): country
dttm (1): lastUpdateDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pred_cordis <- tibble(pred = fit_en %>% predict(new_data = data_recipe %>% bake(text_cordis %>% rename(text = summary) %>% select(text))) %>% pull(.pred_class),
pred_prob = fit_en %>% predict(new_data = data_recipe %>% bake(text_cordis %>% rename(text = summary) %>% select(text)), type = "prob") %>% pull(.pred_TRUE))
text_cordis %>%
bind_cols(pred_cordis) %>%
filter(pred == 'TRUE') %>%
arrange(desc(pred_prob)) %>%
select(projectAcronym, title, pred_prob) %>%
head(50)
tidymodels
: Tidy statistical and predictive modeling ecosystemtidytext
: Tidy text analysis in R ecosystemtextrecipes
: Preprocessing workflows for text datasessionInfo()