加载数据

1. 加载数据

下载新闻数据集

# 获取当前工作目录
getwd()

# 设定当前工作目录
setwd("/Users/{username}/project/")

# 载入数据，如果数据较大，推荐使用 data.table
# https://www.r-bloggers.com/efficiency-of-importing-large-csv-files-in-r/
library(data.table)

datac <- fread("page_entity_20180511.csv",sep = ',')

2. 预处理数据

# 从 HTML 中提取文本
library(htm2txt)

# 重命名列名
names(datac) <- c("id", "page_title", "page_url", "page_content", "page_host_id", "created_at", "updated_at", "page_image", "page_topics", "page_date")

titles <- datac$page_title
titles <- htm2txt(titles)
titles <- gsub("n\t+", "", titles)

urls <- datac$page_url
urls <- gsub("n\t+", "", urls)

datac$page_title <- titles
datac$page_url <- urls

write.csv(file="page_entity.csv", x=datac)

2.1 合并 dataframe

# 读取另一张存有正文内容的表格
data_content = fread("page_content_full.csv", sep=',')

datam <- merge(x=data, y=dfcontent, by.x="V3", by.y="page_url", all.x=TRUE)

datacontent <- datam[datam$page_content != "NA"]

3. 搜索关键词

text_with_s <- grep("美国", texts(titles))

text_with_s <- grep("美国", texts(titles), value=TRUE)

4. 分词处理

install.packages("jiebaR")
library("jiebaR")

cc = worker(bylines = TRUE, stop_word = "stopwords_zh_cn.txt")
tagging = worker("tag")

tagger = worker('tag')
tokencn = segment(contents, cc)

tags = vector_tag(tokencn, tagger)
tagsns = tags[grep("^n", names(tags))]

extract = worker("keywords", topn = 10)
kws = vector_keywords(tokencn)

4. 计算特征词，词义复杂度

cal_dfm <- function(id, data) {
  datap = data[data$V5 == id]

  sizep = lengths(datap)[1]

  contents = datap$cleancontent

  tokensp = tokens(contents)
  tokencn = segment(contents, cc)
  dfmp <- dfm(tokensp, remove = stopwords("chinese"), remove_punct = TRUE)
  topfeatures(dfmp, n=20)
  dfmp
}

cal_dfm_tfidf <- function(dfm) {
  dfm_tfidf <- dfm_tfidf(dfm, scheme_tf = "prop")
  topfeatures(dfm_tfidf, n=20)
  dfm_tfidf
}

cal_ttr <- function(id, data) {
  datap = data[data$V5 == id]
  contents = datap$cleancontent
  tokensp = tokens(contents)
  trr = ntype(tokensp) / lengths(tokensp)
  hist(ttr)
  ttr
}

calfeature(1, data)
getttr(1, data)

5. LDA

5.1 计算 LDA 主题数量

dfm1 <- dfm(datacontent$page_content, remove = stopwords("chinese"), remove_punct = TRUE)

result <- FindTopicsNumber(
  dfm1,
  topics = seq(from = 2, to = 120, by = 2),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
FindTopicsNumber_plot(result)

number_of_topics = 40

5.2 计算 LDA 矩阵

tm1 <- LDA(dfm1, k = number_of_topics, method = "Gibbs",  control = list(seed = 1234))

library(tidytext)
tmt1 <- tidy(tm1, matrix="beta")

terms1 <- tmt1  %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

terms1 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = 'STHeiti'))

5.3 对主题分类

post <- posterior(tm1, dfm1)[["topics"]]

postmat <- as.data.frame(post)

postdat <- add_rownames(postmat, "doc_name")

colnames(postdat)[-1] <- apply(terms(tm1, 3), 2, paste, collapse=", ")

heatmap(post, scale="none")

group_documents <- function(topic_vec) {
  sorted <- sort(topic_vec, decreasing = T, index.return=T)
  dists <- sorted$x
  index <- sorted$ix
  max <- as.numeric(dists[1])
  if (max < 0.7)
    return(c())
  else {
    threshold <- max * 0.8
    names(dists) <- index
    return(dists[dists > threshold])
  }
}

groups <- sapply(postdat[-1], group_documents)

FRE

cal_fre <- function(ids, data) {
  datap <- data[, c("V5", "cleancontent")]
  datap <- datap[V5 %in% ids]
  all_measures <- textstat_readability(datap$cleancontent, c("Flesch", "Dale.Chall", "SMOG", "Coleman.Liau", "Fucks"))
}

all_measures <- cal_fre(c(1), data)

readability_matrix <- cbind(all_measures$Flesch, all_measures$Dale.Chall, all_measures$SMOG, all_measures$Coleman.Liau, all_measures$Fucks)

fre_core <- cor(readability_matrix)
fre_core

datap$fre_flesch <- all_measures$Flesch
datap$fre_dc <- all_measures$Dale.Chall
datap$fre_smog <- all_measures$SMOG
datap$fre_cl <- all_measures$Coleman.Liau
datap$fre_fucks <- all_measures$Fucks

hist(datap[datap$V5 == 1]$fre_flesch, xlim=c(0,120), breaks="FD")

std <- function(x) sd(x)/sqrt(length(x))

保存 workspace

save.image("workspace.RData")

Josherich's Blog

1. 加载数据

2. 预处理数据

2.1 合并 dataframe

3. 搜索关键词

4. 分词处理

4. 计算特征词，词义复杂度

5. LDA

5.1 计算 LDA 主题数量

5.2 计算 LDA 矩阵

5.3 对主题分类

FRE

保存 workspace