text_data_2 into a vector containing just the word “article” and the
respective number in each element (using the function str_extract()),
then randomly change the order of the elements. Finally, order the output
vector using the function str_order(). text_data_2 is available in the
csv file exercise1.

Proposed answer:
library(readr)
library(stringr)
# Read the exercise texts; they are stored in column `x` of the file.
exercise1 <- read.csv("exercise1")$x

# Keep only the "Article <number>" label found in each text. `\\d+` captures
# the whole number, so a two-digit article is not cut down to its first digit.
extract <- str_extract(string = exercise1, pattern = "Article \\d+")

# Shuffle the labels: sample() on a character vector returns a random
# permutation of its elements (and copes with an empty vector).
random_order <- sample(extract)

# str_order() returns the permutation that sorts its input. numeric = TRUE
# compares the embedded numbers by value, so "Article 10" follows "Article 9"
# instead of being placed right after "Article 1".
indexes <- str_order(random_order, numeric = TRUE)
random_order[indexes]
## [1] "Article 1" "Article 2" "Article 3" "Article 4" "Article 5"
Transform text_data_2 into a vector containing just the word “article” and
the respective number in each element (using the function
str_remove_all()), then randomly change the order of the elements. Finally,
order the output vector using the function str_sort(). text_data_2 is
available in the csv file exercise1.

Proposed answer:
library(readr)
library(stringr)
# Load the raw texts from column `x` of the exercise file.
exercise1 <- read.csv("exercise1")$x

# Step 1: drop every line break so each text becomes a single line.
remove1 <- str_remove_all(exercise1, "\n")

# Step 2: delete everything that follows the "Article <digit>" label; the
# look-behind keeps the label itself out of the removed match.
remove2 <- str_remove_all(remove1, "(?<=(Article \\d)).{1,}")

# Shuffle the labels, then print them in sorted order.
random_order <- sample(remove2)
str_sort(random_order)
## [1] "Article 1" "Article 2" "Article 3" "Article 4" "Article 5"
Proposed answer:
library(rvest)
library(stringr)
library(dplyr)
# Page that holds the table of UN member states.
url <- "https://en.wikipedia.org/wiki/Member_states_of_the_United_Nations"

# Download the page, pick the table addressed by the XPath and parse it into
# a data frame.
table <- url %>%
  read_html() %>%
  html_node(xpath = '//*[@id="mw-content-text"]/div[1]/table[2]') %>%
  html_table(fill = TRUE)

# Drop the third column; only the first two are part of the answer.
table <- table[, -3]

# Strip footnote markers such as "[note 3]" from the state names. `\\d+`
# also covers two-digit notes ("[note 12]"), and str_remove_all() removes
# every marker in a cell rather than only the first one.
table$`Member state` <- str_remove_all(
  string = table$`Member state`,
  pattern = "\\[note \\d+\\]"
)
table
## # A tibble: 193 × 2
## `Member state` `Date of admission`
## <chr> <chr>
## 1 Afghanistan 19 November 1946
## 2 Albania 14 December 1955
## 3 Algeria 8 October 1962
## 4 Andorra 28 July 1993
## 5 Angola 1 December 1976
## 6 Antigua and Barbuda 11 November 1981
## 7 Argentina 24 October 1945
## 8 Armenia 2 March 1992
## 9 Australia 1 November 1945
## 10 Austria 14 December 1955
## # … with 183 more rows
Using the eurlex package, download the document with CELEX number
32021R2282, examine the term frequency and plot the results as a word
cloud. Plot the 72 most frequent words.

Proposed answer:
library(eurlex)
library(textstem)
library(tidytext)
library(wordcloud2)
library(dplyr)
library(stringr) # provides str_remove_all(), used below to strip digits

# CELEX number of the act to analyse.
celex_id <- "32021R2282"

# Query all regulations in directory "16" and locate the requested act.
dataset <- elx_make_query(resource_type = "regulation", directory = "16") %>%
  elx_run_query()

row_id <- match(celex_id, dataset$celex)
if (is.na(row_id)) {
  # Without this check an all-NA row would be passed on to the download step.
  stop("CELEX number ", celex_id, " was not returned by the query.",
       call. = FALSE)
}
dataset <- dataset[row_id, ]

# Download the English full text of the act.
legal.act <- elx_fetch_data(dataset$work,
                            type = "text",
                            language_1 = "en")
legal.act <- data.frame(text = legal.act)

# One row per word; drop stop words and reduce each word to its lemma.
legal.act <- legal.act %>%
  unnest_tokens(word, text, token = "ngrams", n = 1) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = lemmatize_words(word))

# Remove digits from every token; tokens that consisted of digits only become
# empty strings and are discarded together with any missing values.
stripped <- str_remove_all(legal.act$word, "\\d")
stripped[stripped %in% ""] <- NA
legal.act$word <- stripped
legal.act <- na.omit(legal.act)

# Count the occurrences of each word and keep the 72 most frequent ones.
legal.act.frequency <- legal.act %>%
  group_by(word) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:72)

wordcloud2(legal.act.frequency)
Using the eurlex package, download the legal acts specified in the csv
file exercise5. Then examine what these documents are about and plot the
results, but plot only the six most important words. Hint: the directory
is 1607.

Proposed answer:
library(readr)
library(eurlex)
library(tidytext)
library(stringr)
library(ggplot2)
library(textstem)
library(dplyr)
# CELEX numbers of the acts to analyse (column `x` of the exercise file).
exercise5 <- read.csv("exercise5")$x

# Query every document type in directory "1607".
dataset <- elx_make_query(resource_type = "any", directory = "1607") %>%
  elx_run_query()

# Keep the requested acts, in the order given in the file. Stop early when a
# CELEX number is missing from the query result: match() would otherwise
# yield NA rows that are then handed to the download step.
row_ids <- match(exercise5, dataset$celex)
if (anyNA(row_ids)) {
  stop("CELEX numbers not returned by the query: ",
       paste(exercise5[is.na(row_ids)], collapse = ", "),
       call. = FALSE)
}
dataset <- dataset[row_ids, ]

# Download the English full text of every act.
legal.acts <- data.frame(CELEX = dataset$celex,
                         text = unlist(lapply(dataset$work, elx_fetch_data,
                                              language_1 = "en",
                                              type = "text")))

# Digits carry no topical information, so remove them before tokenising.
legal.acts$text <- str_remove_all(legal.acts$text, "\\d")

# Word counts per act, most frequent first. count() returns an ungrouped
# table, so the later steps work on the whole table rather than per CELEX.
act.word <- legal.acts %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = lemmatize_words(word)) %>%
  count(CELEX, word, sort = TRUE)

# Add tf, idf and tf-idf columns, treating each act as one document.
act.word.2 <- act.word %>%
  bind_tf_idf(word, CELEX, n)

# Plot the six words with the highest tf-idf for each act, one panel per act.
act.word.2 %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(CELEX) %>%
  slice(1:6) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = CELEX)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~CELEX, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw()
exercise6. There should be six groups: “Regulations”, “Decisions”,
“Recommendations”, “Opinions”, “Resolutions”, “Other documents”. Create a
data frame where the first column is the CELEX number and the second
column is the group name.

Proposed answer:
Hello, Buddy! If you thought that you should have used LDA, then sorry, you were wrong. I’m a little mean. The answer is nothing more than pattern recognition. ;)
Import the data into the R environment.
library(stringr) # str_remove() and str_sub() are needed from the first step

# Read the CELEX numbers (column `x`) and strip the "CELEX number: " label.
dataset <- read.csv("exercise6")$x
dataset <- str_remove(string = dataset, pattern = "CELEX number: ")

# The sixth character of each CELEX number is extracted as its code letter.
coding <- str_sub(dataset, start = 6L, end = 6L)

# Flag the first occurrence of every code and collect the distinct codes.
uniques <- !duplicated(coding)
unique_codes <- coding[uniques]
Here you can find information about coding in CELEX numbers
library(rvest)
library(stringr)
library(dplyr)
# EUR-Lex page containing the table of document-type codes.
url <- "https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html"

# Download the page, select the table by XPath and parse it.
page <- read_html(url)
table_node <- html_node(
  page,
  xpath = '//*[@id="content"]/div/div/div/div[3]/table'
)
table <- html_table(table_node, fill = TRUE)

# Keep the rows whose code (column X2) occurs in the data and drop the first
# column.
codes <- table[table$X2 %in% unique_codes, -1]

# Shorten the descriptions by removing the two qualifying phrases.
codes$X3 <- codes$X3 %>%
  str_remove(pattern = " \\(with or without addressee\\)") %>%
  str_remove(pattern = " published in OJ C")

names(codes) <- c("code", "object")
codes
## # A tibble: 6 × 2
## code object
## <chr> <chr>
## 1 R Regulations
## 2 D Decisions
## 3 H Recommendations
## 4 A Opinions
## 5 G Resolutions
## 6 Y Other documents
The rest is quite easy.
#' Translate code letters into group names.
#'
#' Looks each element of `x` up in the `code` column of the global `codes`
#' table and returns the matching entry of its `object` column.
#'
#' @param x Vector of code letters.
#' @return Vector of group names, the same length as `x`; codes that are not
#'   present in `codes$code` give `NA`.
decoding <- function(x) {
  lookup_rows <- match(x, codes$code)
  codes$object[lookup_rows]
}
# Pair every CELEX number with its code letter, translate the letter into the
# group name, and keep only the CELEX number and the group name.
answer <- data.frame(CELEX = dataset, code = coding)
answer$object <- decoding(answer$code)
answer <- answer[, c("CELEX", "object")]
answer
## CELEX object
## 1 32021R0101 Regulations
## 2 32021R0100 Regulations
## 3 32017A0608(01) Opinions
## 4 32015A1205(01) Opinions
## 5 32015A0228(01) Opinions
## 6 32014A1122(01) Opinions
## 7 32010A1222(01) Opinions
## 8 32010H0635 Recommendations
## 9 32010A0928(01) Opinions
## 10 32010A0804(01) Opinions
## 11 32010A0415(01) Opinions
## 12 32010A0414(01) Opinions
## 13 32010A0320(01) Opinions
## 14 32010A0318(01) Opinions
## 15 32009A0707(01) Opinions
## 16 32009H0120 Recommendations
## 17 32008A0509(01) Opinions
## 18 22008A0208(01) Opinions
## 19 32007A1221(01) Opinions
## 20 32007D0513 Decisions
## 21 32006D0908 Decisions
## 22 32006D0626 Decisions
## 23 32006H0040 Recommendations
## 24 32005R0302 Regulations
## 25 22002A1127(01) Opinions
## 26 32002G0522(01) Resolutions
## 27 31999Y0209(01) Other documents
## 28 31997D0873 Decisions
## 29 31996D0671 Decisions
## 30 31994D0955 Decisions
## 31 31994H0956 Recommendations
## 32 21994A0803(01) Opinions
## 33 31993R1493 Regulations
## 34 31992Y0708(02) Other documents
## 35 31992D0194 Decisions
## 36 31990D0413 Decisions
## 37 31989A0476 Opinions
## 38 31989A0354 Opinions
## 39 31989A0082 Opinions
## 40 31975Y0814(01) Other documents