# Package order matters: plyr is attached before dplyr so that dplyr's verbs
# mask plyr's functions of the same name.
library(magrittr)
library(tm)
library(plyr)
library(dplyr)
library(ggplot2)
library(randomForest)
# Load the labelled training reviews from a tab-separated file.
# quote = "" switches off quote processing, so quotation marks inside the
# review text are read as ordinary characters (visible in the preview below).
data_train <- read.delim(
  "labeledTrainData.tsv",
  header = TRUE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE
)

# Preview the first 700 characters of the first review. The column is
# addressed by name rather than by position so the call does not depend on
# column order.
paste(substr(data_train$review[1], 1, 700), "...")
## [1] "\"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik ..."
# Build the cleaned, stemmed corpus of the labelled training reviews.
# Steps, in order: lower-case, strip punctuation, drop the word "movie" and
# the English stop words, then stem.
#
# tolower() is wrapped in content_transformer() so that tm_map() keeps
# returning a valid corpus. This replaces the older
# tm_map(tolower) + tm_map(PlainTextDocument) workaround.
train_corpus <- data_train$review %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, c("movie", stopwords("english"))) %>%
  tm_map(stemDocument)
# Count every term of the training corpus: documents are rows, terms are
# columns. The printed summary below is the output recorded with the script.
frequencies <- train_corpus %>% DocumentTermMatrix()
print(frequencies)
## <<DocumentTermMatrix (documents: 25000, terms: 92244)>>
## Non-/sparse entries: 2387851/2303712149
## Sparsity : 100%
## Maximal term length: 64
## Weighting : term frequency (tf)
# Shrink the vocabulary by discarding terms that are too sparse (threshold
# 0.95). In the recorded output this leaves 373 of the 92244 terms.
sparse <- frequencies %>% removeSparseTerms(0.95)
print(sparse)
## <<DocumentTermMatrix (documents: 25000, terms: 373)>>
## Non-/sparse entries: 1046871/8278129
## Sparsity : 89%
## Maximal term length: 10
## Weighting : term frequency (tf)
# Dense feature table: one row per review, one column per retained term.
reviewSparse <- sparse %>%
  as.matrix() %>%
  as.data.frame()

# Capture the term columns before the label column is added. This vocabulary
# is reused as the dictionary when the test reviews are vectorised.
vocab <- names(reviewSparse)

# Attach the class label as a factor with readable level names.
reviewSparse$sentiment <- revalue(
  as.factor(data_train$sentiment),
  c("0" = "neg", "1" = "pos")
)
row.names(reviewSparse) <- NULL

# Baseline classifier: random forest of 100 trees on the term counts.
model_rf <- randomForest(sentiment ~ ., data = reviewSparse, ntree = 100)
# Score the test reviews with the fitted forest and write the submission file.

# Read the test set the same way as the training set (no quote processing).
data_test <- read.delim(
  "testData.tsv",
  header = TRUE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE
)

# Apply the same cleaning steps as for the training corpus.
# content_transformer(tolower) keeps the corpus valid, so no
# PlainTextDocument conversion is needed afterwards.
test_corpus <- data_test$review %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, c("movie", stopwords("english"))) %>%
  tm_map(stemDocument)

# Restrict the test matrix to the training vocabulary so that its columns
# line up with the predictors the model was fitted on.
test_frequencies <- DocumentTermMatrix(
  test_corpus,
  control = list(dictionary = vocab)
)
reviewSparse_test <- as.data.frame(as.matrix(test_frequencies))
row.names(reviewSparse_test) <- NULL

sentiment_test <- predict(model_rf, newdata = reviewSparse_test)

# Map the predicted factor ("neg"/"pos") straight to 0/1. This replaces the
# detour through cbind(), which relied on the factor being silently reduced
# to its integer codes 1/2 before being relabelled.
pred_test <- data.frame(
  id = data_test$id,
  sentiment = as.integer(sentiment_test == "pos"),
  stringsAsFactors = FALSE
)

write.csv(pred_test, file = "Submission.csv", quote = FALSE, row.names = FALSE)
# Total stem frequencies over the reviews of one sentiment class.
#
# reviews     character vector of review texts
# labels      vector of class labels, parallel to `reviews`
# class_value the label whose reviews are counted
#
# Returns a data frame with columns `word` (stem) and `freq` (total count
# over the selected reviews).
#
# The reviews are passed to VectorSource() as a character vector, one
# document per review. Passing a one-column data frame instead (the result of
# select(review)) makes the whole class a single document, which turns the
# removeSparseTerms() step into a no-op.
stem_freq_by_class <- function(reviews, labels, class_value) {
  class_dtm <- reviews[labels == class_value] %>%
    VectorSource() %>%
    Corpus() %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(stemDocument) %>%
    DocumentTermMatrix() %>%
    removeSparseTerms(0.999)

  # Sum each term column on the sparse representation; converting the full
  # per-review matrix to a dense one first would be needlessly large.
  totals <- slam::col_sums(class_dtm)

  freq_df <- data.frame(word = names(totals), freq = totals)
  rownames(freq_df) <- NULL
  freq_df
}

# Most frequent stems in negative reviews (sentiment == 0).
freq_df_neg <- stem_freq_by_class(data_train$review, data_train$sentiment, 0)
head(arrange(freq_df_neg, desc(freq)))
## Output recorded with the original script:
##   word  freq
## 1 movi 27800
## 2 film 21900
## 3  one 12959
## 4 like 12001
## 5 just 10539
## 6 make  7846

# Most frequent stems in positive reviews (sentiment == 1).
freq_df_pos <- stem_freq_by_class(data_train$review, data_train$sentiment, 1)
head(arrange(freq_df_pos, desc(freq)))
## Output recorded with the original script:
##   word  freq
## 1 film 24398
## 2 movi 21796
## 3  one 13706
## 4 like 10138
## 5 time  7889
## 6 good  7508
# Put negative (freq.x) and positive (freq.y) counts side by side, one row
# per stem. all = TRUE keeps stems that occur in only one of the two classes.
freq_all <- merge(freq_df_neg, freq_df_pos, by = "word", all = TRUE)

# A stem missing from one class has a count of zero there, not NA.
count_cols <- c("freq.x", "freq.y")
freq_all[count_cols][is.na(freq_all[count_cols])] <- 0

# Absolute difference between the class counts.
freq_all$diff <- abs(freq_all$freq.x - freq_all$freq.y)
head(arrange(freq_all, desc(diff)))
##    word freq.x freq.y diff
## 1  movi  27800  21796 6004
## 2   bad   7660   1931 5729
## 3 great   2692   6459 3767
## 4  just  10539   7109 3430
## 5  love   2767   5988 3221
## 6  even   7707   5056 2651
# Relative difference between the class counts. The constant 300 in the
# denominator keeps stems with a small total count from scoring close to 1.
freq_all$diff_norm <- with(
  freq_all,
  abs(freq.x - freq.y) / (freq.x + freq.y + 300)
)
freq_all %>%
  arrange(desc(diff_norm)) %>%
  head()
##      word freq.x freq.y diff diff_norm
## 1   worst   2436    246 2190 0.7344064
## 2    wast   1996    192 1804 0.7250804
## 3 horribl   1189    194  995 0.5912062
## 4  stupid   1525    293 1232 0.5816808
## 5     bad   7660   1931 5729 0.5792134
## 6    wors   1183    207  976 0.5775148
# The 500 stems with the largest normalised class difference, kept as a
# one-column data frame (`word`).
freq_word <- freq_all %>%
  arrange(desc(diff_norm)) %>%
  select(word) %>%
  head(500)
# Refit the forest on the discriminative vocabulary selected above.
vocab <- as.character(freq_word$word)

# Count only the selected stems in the training corpus.
frequencies <- DocumentTermMatrix(
  train_corpus,
  control = list(dictionary = vocab)
)
reviewSparse_train <- as.data.frame(as.matrix(frequencies))
row.names(reviewSparse_train) <- NULL

# Stems are arbitrary strings and may collide with R reserved words or other
# non-syntactic names. Sanitise the column names before they go through the
# formula interface, as the tf-idf variant of this model also does.
colnames(reviewSparse_train) <- make.names(colnames(reviewSparse_train))

reviewSparse_train$sentiment <- data_train$sentiment %>%
  as.factor() %>%
  revalue(c("0" = "neg", "1" = "pos"))

model_rf <- randomForest(sentiment ~ ., data = reviewSparse_train, ntree = 100)
# Third feature set: pick the vocabulary by total tf-idf weight over the
# labelled AND unlabelled reviews, then fit the forest on tf-idf features of
# the labelled reviews only.
library(slam)

data_train_un <- read.delim(
  "unlabeledTrainData.tsv",
  header = TRUE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE
)

# Labelled reviews come first; the subsetting further down relies on this.
train_review <- c(data_train$review, data_train_un$review)

# content_transformer(tolower) keeps the corpus valid, so no
# PlainTextDocument conversion is needed afterwards.
train_corpus <- train_review %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)

# Unnormalised tf-idf weighting, shared by both matrices below.
weight_tfidf_raw <- function(x) weightTfIdf(x, normalize = FALSE)

tdm <- TermDocumentMatrix(
  train_corpus,
  control = list(weighting = weight_tfidf_raw)
)

# Total tf-idf weight of each term over all documents, computed on the
# sparse matrix.
freq <- row_sums(tdm)
freq_df <- data.frame(word = names(freq), tf_idf = freq)
row.names(freq_df) <- NULL
freq_df %<>% arrange(desc(tf_idf))

# Top 500 terms by total weight. head() avoids NA padding when fewer than
# 500 terms exist.
vocab <- head(as.character(freq_df$word), 500)

# Keep only the labelled documents. They are the first
# length(data_train$review) entries of the combined corpus and are already
# cleaned and stemmed, so there is no need to preprocess them again.
train_corpus <- train_corpus[seq_along(data_train$review)]

frequencies <- DocumentTermMatrix(
  train_corpus,
  control = list(dictionary = vocab, weighting = weight_tfidf_raw)
)

# Free the large intermediates before the dense conversion.
rm(data_train_un, tdm, train_review)

reviewSparse_train <- as.data.frame(as.matrix(frequencies))
row.names(reviewSparse_train) <- NULL
# Make the term names syntactically valid before they are used in a formula.
colnames(reviewSparse_train) <- make.names(colnames(reviewSparse_train))

reviewSparse_train$sentiment <- data_train$sentiment %>%
  as.factor() %>%
  revalue(c("0" = "neg", "1" = "pos"))

rm(data_train, train_corpus, freq, freq_df)

model_rf <- randomForest(sentiment ~ ., data = reviewSparse_train, ntree = 100)
# Source: https://habr.com/ru/post/270591/
# All Articles