text_data_2 into a vector containing
just the word “article” and the respective number in each element (using
function str_extract()), then randomly change the order of the
elements. Finally, order the output vector using function
str_order(). text_data_2 is available in the csv
file exercise1. Proposed answer:
library(readr)
library(stringr)
# Read the exercise texts; they are stored in column `x` of the CSV file.
exercise1 <- read.csv("exercise1")$x
# Keep only the word "Article" plus its complete number. `\\d+` captures
# multi-digit numbers; a single `\\d` would cut "Article 12" to "Article 1".
extract <- str_extract(string = exercise1, pattern = "Article \\d+")
# Shuffle the elements: sample() on a character vector returns a random
# permutation of it.
random_order <- sample(extract)
# str_order() returns the index permutation that sorts its input.
# numeric = TRUE compares embedded numbers by value, so "Article 10" is
# placed after "Article 2" instead of right after "Article 1".
indexes <- str_order(random_order, numeric = TRUE)
random_order[indexes]
## [1] "Article 1" "Article 2" "Article 3" "Article 4" "Article 5"
text_data_2 into a vector containing
just the word “article” and the respective number in each element (using
function str_remove_all()), then randomly change the order of the
elements. Finally, order the output vector using function
str_sort(). text_data_2 is available in the csv
file exercise1. Proposed answer:
library(readr)
library(stringr)
# Load the raw texts (column `x` of the CSV file).
exercise1 <- read.csv("exercise1")$x
# Strip line breaks first -- presumably so that `.` in the next pattern can
# run through to the end of each text (TODO confirm against the data).
remove1 <- str_remove_all(exercise1, "\n")
# Delete everything that follows "Article <digit>", leaving just that prefix.
# NOTE(review): only one digit is kept, so "Article 12" would end up as
# "Article 1" -- confirm the data never goes past nine articles.
remove2 <- str_remove_all(remove1, "(?<=(Article \\d)).{1,}")
# Shuffle by indexing with a random permutation of the positions.
random_order <- remove2[sample(seq_along(remove2), length(remove2))]
# str_sort() returns the sorted strings directly.
str_sort(random_order)
## [1] "Article 1" "Article 2" "Article 3" "Article 4" "Article 5"
Proposed answer:
library(rvest)
library(stringr)
library(dplyr)
url <- "https://en.wikipedia.org/wiki/Member_states_of_the_United_Nations"
# Download the page, pick the table addressed by the XPath expression and
# convert it into a data frame.
table <- url %>%
  read_html() %>%
  html_node(xpath = '//*[@id="mw-content-text"]/div[1]/table[2]') %>%
  html_table(fill = TRUE)
# Drop the third column; the member state and admission date columns remain.
table <- table[, -3]
# Strip every footnote marker such as "[note 3]" or "[note 12]".
# str_remove_all() with `\\d+` handles several markers per name and
# multi-digit note numbers; str_remove() with `\\d` removed only the first
# marker and only when its number had a single digit.
table$`Member state` <- str_remove_all(
  string = table$`Member state`,
  pattern = "\\[note \\d+\\]"
)
table
## # A tibble: 193 × 2
## `Member state` `Date of admission`
## <chr> <chr>
## 1 Afghanistan 19 November 1946
## 2 Albania 14 December 1955
## 3 Algeria 8 October 1962
## 4 Andorra 28 July 1993
## 5 Angola 1 December 1976
## 6 Antigua and Barbuda 11 November 1981
## 7 Argentina 24 October 1945
## 8 Armenia 2 March 1992
## 9 Australia 1 November 1945
## 10 Austria 14 December 1955
## # … with 183 more rows
eurlex package download the document with
CELEX number 32021R2282, examine term frequency and plot the results as
a word cloud. Plot the 72 most frequent words. Proposed answer:
library(eurlex)
library(textstem)
library(tidytext)
library(wordcloud2)
library(dplyr)
# Query the regulations of directory 16 and keep the row of the requested
# act, looked up by its CELEX number.
dataset <- elx_make_query(resource_type = "regulation", directory = "16") %>%
  elx_run_query()
dataset <- dataset[match("32021R2282", dataset$celex), ]

# Fetch the English text of the act and wrap it in a one-column data frame
# so that it can be tokenised.
legal.act <- elx_fetch_data(dataset$work, type = "text", language_1 = "en")
legal.act <- data.frame(text = legal.act)

# One word per row, stop words removed, words reduced to their lemma.
# The join key is spelled out so anti_join() does not have to guess it.
legal.act <- legal.act %>%
  unnest_tokens(word, text, token = "ngrams", n = 1) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = lemmatize_words(word))

# Strip digits and drop the tokens that become empty (or are missing).
# str_remove_all() is namespace-qualified because the library() calls of
# this answer do not attach stringr.
legal.act$word <- stringr::str_remove_all(legal.act$word, "\\d")
keep <- !is.na(legal.act$word) & legal.act$word != ""
legal.act <- legal.act[keep, , drop = FALSE]

# Count the occurrences of every word and keep the 72 most frequent ones.
legal.act.frequency <- legal.act %>%
  group_by(word) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:72)
wordcloud2(legal.act.frequency)
eurlex package download the legal acts
specified in the csv file exercise5. Then examine what these
documents are about and plot the results, but plot just the six most
important words. Hint: the directory is 1607. Proposed answer:
library(readr)
library(eurlex)
library(tidytext)
library(stringr)
library(ggplot2)
library(textstem)
library(dplyr)
# CELEX numbers of the requested acts (column `x` of the CSV file).
exercise5 <- read.csv("exercise5")$x

# Query directory 1607 and keep the rows of the requested acts, in the order
# in which they are listed in the CSV file.
dataset <- elx_make_query(resource_type = "any", directory = "1607") %>%
  elx_run_query()
dataset <- dataset[match(exercise5, dataset$celex), ]

# Download the English text of every act, one row per act.
legal.acts <- data.frame(
  CELEX = dataset$celex,
  text = unlist(lapply(dataset$work, elx_fetch_data,
                       language_1 = "en",
                       type = "text"))
)
legal.acts$text <- str_remove_all(legal.acts$text, "\\d")

# Word counts per act. ungroup() matters here: summarise() only peels off the
# last grouping variable, so without it the result would stay grouped by
# CELEX and later steps would silently run group by group.
act.word <- legal.acts %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = lemmatize_words(word)) %>%
  group_by(CELEX, word) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# tf-idf of every word, with the acts acting as the documents.
act.word.2 <- act.word %>%
  bind_tf_idf(word, CELEX, n)

# Plot the six highest-scoring words of each act. The factor levels are
# built on the ungrouped data, i.e. from the overall tf-idf ranking, before
# the rows are grouped again to pick the top six per act.
act.word.2 %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(CELEX) %>%
  slice(1:6) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = CELEX)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~CELEX, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw()
exercise6. There
should be six groups: “Regulations”, “Decisions”, “Recommendations”,
“Opinions”, “Resolutions”, “Other documents”. Create a data frame where
the first column is the CELEX number and the second column is the group
name. Proposed answer:
Hello, Buddy! If you thought that you should have used LDA, then sorry, you were wrong. I’m a little mean. The answer is nothing more than pattern recognition. ;)
Import the data into the R environment.
# Read the identifiers (column `x`) and strip the "CELEX number: " label.
dataset <- read.csv("exercise6")$x
dataset <- str_remove(dataset, "CELEX number: ")
# The sixth character of every identifier is the code used for grouping.
coding <- str_sub(dataset, start = 6L, end = 6L)
# Flag the first occurrence of each code and collect the distinct codes.
uniques <- !duplicated(coding)
unique_codes <- coding[uniques]
Here you can find information about coding in CELEX numbers
library(rvest)
library(stringr)
library(dplyr)
url <- "https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html"
# Download the page and turn the table found by the XPath expression into a
# data frame.
table <- url %>%
  read_html() %>%
  html_node(xpath = '//*[@id="content"]/div/div/div/div[3]/table') %>%
  html_table(fill = TRUE)
# Keep the rows whose code (column X2) occurs in the data, then drop the
# first column.
is_used <- table$X2 %in% unique_codes
codes <- table[is_used, ]
codes <- codes[, -1]
# Shorten the descriptions by removing two qualifying phrases.
codes$X3 <- codes$X3 %>%
  str_remove(pattern = " \\(with or without addressee\\)") %>%
  str_remove(pattern = " published in OJ C")
names(codes) <- c("code", "object")
codes
## # A tibble: 6 × 2
## code object
## <chr> <chr>
## 1 R Regulations
## 2 D Decisions
## 3 H Recommendations
## 4 A Opinions
## 5 G Resolutions
## 6 Y Other documents
The rest is quite easy.
# Translate codes into their descriptions using the `codes` lookup table.
#
# x: vector of codes to look up in `codes$code`.
# Returns the matching entries of `codes$object`, in the order of `x`;
# codes that are not in the table yield NA.
decoding <- function(x) {
  positions <- match(x, codes$code)
  codes$object[positions]
}
# Pair every CELEX number with the description of its code. The code itself
# is only needed for the lookup, so it never becomes a column.
answer <- data.frame(CELEX = dataset)
answer$object <- decoding(coding)
answer
## CELEX object
## 1 32021R0101 Regulations
## 2 32021R0100 Regulations
## 3 32017A0608(01) Opinions
## 4 32015A1205(01) Opinions
## 5 32015A0228(01) Opinions
## 6 32014A1122(01) Opinions
## 7 32010A1222(01) Opinions
## 8 32010H0635 Recommendations
## 9 32010A0928(01) Opinions
## 10 32010A0804(01) Opinions
## 11 32010A0415(01) Opinions
## 12 32010A0414(01) Opinions
## 13 32010A0320(01) Opinions
## 14 32010A0318(01) Opinions
## 15 32009A0707(01) Opinions
## 16 32009H0120 Recommendations
## 17 32008A0509(01) Opinions
## 18 22008A0208(01) Opinions
## 19 32007A1221(01) Opinions
## 20 32007D0513 Decisions
## 21 32006D0908 Decisions
## 22 32006D0626 Decisions
## 23 32006H0040 Recommendations
## 24 32005R0302 Regulations
## 25 22002A1127(01) Opinions
## 26 32002G0522(01) Resolutions
## 27 31999Y0209(01) Other documents
## 28 31997D0873 Decisions
## 29 31996D0671 Decisions
## 30 31994D0955 Decisions
## 31 31994H0956 Recommendations
## 32 21994A0803(01) Opinions
## 33 31993R1493 Regulations
## 34 31992Y0708(02) Other documents
## 35 31992D0194 Decisions
## 36 31990D0413 Decisions
## 37 31989A0476 Opinions
## 38 31989A0354 Opinions
## 39 31989A0082 Opinions
## 40 31975Y0814(01) Other documents