Download the news dataset
# Get the current working directory
getwd()
# Set the working directory
setwd("/Users/{username}/project/")
# Load the data; for large CSV files, data.table's fread is recommended
# https://www.r-bloggers.com/efficiency-of-importing-large-csv-files-in-r/
library(data.table)
datac <- fread("page_entity_20180511.csv", sep = ',')
# Extract plain text from HTML
library(htm2txt)
# Rename the columns
names(datac) <- c("id", "page_title", "page_url", "page_content", "page_host_id", "created_at", "updated_at", "page_image", "page_topics", "page_date")
titles <- datac$page_title
titles <- htm2txt(titles)
titles <- gsub("[\n\t]+", "", titles)  # strip newlines and tabs left over from the HTML
urls <- datac$page_url
urls <- gsub("[\n\t]+", "", urls)
datac$page_title <- titles
datac$page_url <- urls
write.csv(file="page_entity.csv", x=datac)
# Read another table that holds the full article text
data_content <- fread("page_content_full.csv", sep = ',')
# Join the article text onto the metadata (here `data` is the metadata table read
# with default column names, so V3 is the page URL and V5 the publisher id)
datam <- merge(x = data, y = data_content, by.x = "V3", by.y = "page_url", all.x = TRUE)
# Keep only rows that actually have article text
datacontent <- datam[!is.na(datam$page_content) & datam$page_content != "NA"]
# Which titles mention "美国"? grep() returns the matching indices;
# with value = TRUE it returns the matching titles themselves
text_with_s <- grep("美国", titles)
text_with_s <- grep("美国", titles, value = TRUE)
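# A quick follow-up sketch (an addition, not from the original notes): how many
# titles mention "美国", and what share of all titles that is.
n_us <- length(text_with_s)
n_us
n_us / length(titles)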
# Chinese word segmentation with jiebaR
install.packages("jiebaR")
library("jiebaR")
# Segmentation worker: one document per line, with a Chinese stop-word list
cc <- worker(bylines = TRUE, stop_word = "stopwords_zh_cn.txt")
# Part-of-speech tagging worker
tagger <- worker("tag")
# Segment the article text (taking `contents` to be the merged full text)
contents <- datacontent$page_content
tokencn <- segment(contents, cc)
# POS-tag the segmented words and keep only the nouns (tags starting with "n")
tags <- vector_tag(unlist(tokencn), tagger)
tagsns <- tags[grep("^n", names(tags))]
# Keyword extraction worker, top 10 keywords
extract <- worker("keywords", topn = 10)
kws <- vector_keywords(unlist(tokencn), extract)
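# A small sketch (an addition, not from the original notes): extract keywords per
# article rather than from the pooled tokens, using jiebaR's keywords() on the raw
# text of the first few articles; the choice of 5 is arbitrary.
kws_per_doc <- lapply(head(datacontent$page_content, 5),
                      function(x) keywords(x, extract))
kws_per_doc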
# quanteda is needed for tokens(), dfm(), stopwords() and topfeatures()
library(quanteda)
# Document-feature matrix for a single publisher (id is matched against V5)
cal_dfm <- function(id, data) {
  datap <- data[data$V5 == id]
  sizep <- nrow(datap)  # number of articles for this publisher
  contents <- datap$cleancontent
  tokensp <- tokens(contents)
  dfmp <- dfm(tokensp, remove = stopwords("chinese"), remove_punct = TRUE)
  print(topfeatures(dfmp, n = 20))
  dfmp
}
# TF-IDF weighting of a dfm, with term frequency as within-document proportion
cal_dfm_tfidf <- function(dfmp) {
  tfidfp <- dfm_tfidf(dfmp, scheme_tf = "prop")
  print(topfeatures(tfidfp, n = 20))
  tfidfp
}
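# Usage sketch (assumes publisher ids 1 and 2 both exist in `data`): compare the
# TF-IDF top terms of two publishers built with the helpers above.
top_1 <- topfeatures(cal_dfm_tfidf(cal_dfm(1, data)), n = 20)
top_2 <- topfeatures(cal_dfm_tfidf(cal_dfm(2, data)), n = 20)
names(top_1)
names(top_2)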
# Type-token ratio (lexical diversity) per article for one publisher
cal_ttr <- function(id, data) {
  datap <- data[data$V5 == id]
  contents <- datap$cleancontent
  tokensp <- tokens(contents)
  ttr <- ntype(tokensp) / ntoken(tokensp)
  hist(ttr)
  ttr
}
cal_dfm(1, data)
cal_ttr(1, data)
# Corpus-wide dfm over the full article text
dfm1 <- dfm(datacontent$page_content, remove = stopwords("chinese"), remove_punct = TRUE)
# topicmodels / ldatuning work on a DocumentTermMatrix, so convert the dfm
dtm1 <- convert(dfm1, to = "topicmodels")
# Search for a reasonable number of topics with ldatuning
library(ldatuning)
result <- FindTopicsNumber(
  dtm1,
  topics = seq(from = 2, to = 120, by = 2),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
FindTopicsNumber_plot(result)
library(topicmodels)
number_of_topics <- 40
tm1 <- LDA(dtm1, k = number_of_topics, method = "Gibbs", control = list(seed = 1234))
library(tidytext)
library(dplyr)
library(ggplot2)
tmt1 <- tidy(tm1, matrix = "beta")
terms1 <- tmt1  %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
terms1 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = 'STHeiti'))
# Per-document topic distribution from the fitted model
post <- posterior(tm1)[["topics"]]
postmat <- as.data.frame(post)
postdat <- tibble::rownames_to_column(postmat, "doc_name")
colnames(postdat)[-1] <- apply(terms(tm1, 3), 2, paste, collapse=", ")
heatmap(post, scale="none")
# For one topic column: keep the documents whose probability for this topic is both
# high in absolute terms (best match >= 0.7) and close to that best match
group_documents <- function(topic_vec) {
  sorted <- sort(topic_vec, decreasing = TRUE, index.return = TRUE)
  dists <- sorted$x
  index <- sorted$ix
  max_p <- as.numeric(dists[1])
  if (max_p < 0.7) {
    return(c())
  } else {
    threshold <- max_p * 0.8
    names(dists) <- index
    return(dists[dists > threshold])
  }
}
groups <- sapply(postdat[-1], group_documents)
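# Inspection sketch (an assumption, not from the original notes: it assumes the row
# order of `postdat` follows the row order of `datacontent`): look at the documents
# grouped under the first topic and pull their titles.
grp1 <- groups[[1]]
grp1
datacontent$page_title[as.integer(names(grp1))]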
# Readability measures for the articles of the given publisher ids
cal_fre <- function(ids, data) {
  datap <- data[, c("V5", "cleancontent")]
  datap <- datap[V5 %in% ids]
  textstat_readability(datap$cleancontent,
                       c("Flesch", "Dale.Chall", "SMOG", "Coleman.Liau", "Fucks"))
}
all_measures <- cal_fre(c(1), data)
readability_matrix <- cbind(all_measures$Flesch, all_measures$Dale.Chall, all_measures$SMOG, all_measures$Coleman.Liau, all_measures$Fucks)
fre_core <- cor(readability_matrix)
fre_core
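# Optional sketch (an addition, not from the original notes): visualise how strongly
# the five readability measures agree, reusing base R's heatmap() on the correlation matrix.
heatmap(fre_core, scale = "none", symm = TRUE)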
# Attach the scores back onto the subset of articles they were computed for
datap <- data[V5 %in% c(1), c("V5", "cleancontent")]
datap$fre_flesch <- all_measures$Flesch
datap$fre_dc <- all_measures$Dale.Chall
datap$fre_smog <- all_measures$SMOG
datap$fre_cl <- all_measures$Coleman.Liau
datap$fre_fucks <- all_measures$Fucks
hist(datap[datap$V5 == 1]$fre_flesch, xlim=c(0,120), breaks="FD")
# Standard error of the mean
std <- function(x) sd(x) / sqrt(length(x))
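# Usage sketch: the standard error of the Flesch scores attached above.
std(datap$fre_flesch)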
save.image("workspace.RData")