# Load Packages
suppressPackageStartupMessages({
library(magrittr)
library(DBI)
library(data.table); options(datatable.print.class = TRUE)
library(RSQLite)
# library(tidytext)
library(tokenizers)
library(text2vec)
library(ggplot2)
library(plotly)
library(jiebaR)
library(stringr)
# devtools::install_github("bmschmidt/wordVectors")
library(wordVectors)
library(wordcloud2)
# library(tsne)
library(dbscan)
library(factoextra)
})
source("R/utils.R", encoding = "UTF-8")
source("R/filter_dtm.R", encoding = "UTF-8")
con <- dbConnect(RSQLite::SQLite(), "./dataset/db.sqlite")
dt <- dbReadTable(con, "Gossiping") %>% setDT
dbDisconnect(con)
# dt[sample(seq(.N), 5)] %>% View # take a look
# Remove reply text and messages
dt[title %>% str_detect("^Re:"),
`:=`(post_text = post_text %>%
str_replace_all('(?m)^(:|※).*$', "") %>%
str_replace_all('(?im)(posted|sent) from.*$', ""))]
# Remove URLs
url_regex = 'http[s]?://(?:[a-zA-Z0-9$-_@.&+!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+(?:#[-A-z0-9]+)?'
dt[, `:=`(post_text = post_text %>% str_replace_all(url_regex, ""),
push_text = push_text %>% str_replace_all(url_regex, ""))]
# Remove all but Eng and Zht
dt[, `:=`(post_text = post_text %>%
str_replace_all("[^\u4e00-\u9fa5A-Za-z\\d\\n\\s]+", " "),
push_text = push_text %>%
str_replace_all("[^\u4e00-\u9fa5A-Za-z\\d\\n\\s]+", " "))]
# Filter
dt <- dt[! (str_detect(title, "置底|公告") | str_detect(post_text, "置底|公告"))]
dt[sample(seq(.N), 5)] %>% View # take a look again
# 起手式,結巴建立斷詞器
mix_seg <- worker(type = "mix",
dict = "dict/jieba_dict_utf8.txt",
stop_word = "dict/stop_utf8.txt",
symbol = FALSE,
encoding = "UTF-8")
# tokenize
text_seg <- dt[, paste(post_text, push_text)] %>% lapply(cutter, mix_seg)
segment("中華電信499吃到飽之亂引爆戰火", mix_seg)
#> [1] "中華電信" "499" "吃" "飽" "之亂" "引爆"
#> [7] "戰火"
x <- "中華電信499吃到飽之亂引爆戰火"
tokenize_character_shingles(x, n = 4, n_min = 1, simplify = TRUE)
#> [1] "中" "中華" "中華電" "中華電信" "華" "華電"
#> [7] "華電信" "華電信4" "電" "電信" "電信4" "電信49"
#> [13] "信" "信4" "信49" "信499" "4" "49"
#> [19] "499" "499吃" "9" "99" "99吃" "99吃到"
#> [25] "9" "9吃" "9吃到" "9吃到飽" "吃" "吃到"
#> [31] "吃到飽" "吃到飽之" "到" "到飽" "到飽之" "到飽之亂"
#> [37] "飽" "飽之" "飽之亂" "飽之亂引" "之" "之亂"
#> [43] "之亂引" "之亂引爆" "亂" "亂引" "亂引爆" "亂引爆戰"
#> [49] "引" "引爆" "引爆戰" "引爆戰火" "爆" "爆戰"
#> [55] "爆戰火" "戰" "戰火" "火"
Try PTT
word_count_table <- dt[, paste(post_text, push_text)] %>%
str_split("[\n\\s]+") %>%
unlist %>%
tokenize_character_shingles(n = 4, n_min = 1) %>%
unlist %>%
table
dt_wc <- word_count_table %>% as.data.table %>% setnames(c("word", "N"))
dt_wc
dt_wc[, ngram := str_length(word)]
dt_wc[, p := N / sum(N), by = .(ngram)]
# bi-gram PMI
bi_wc <- copy(dt_wc[ngram == 2])
bi_wc[, `:=`(w1 = str_sub(word, 1, -2),
w2 = str_sub(word, -1, -1))]
bi_wc <- bi_wc %>%
merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>%
merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>%
.[, pmi := log((p^1) / (p1 * p2))]
(bi_wc_final <- bi_wc[(pmi > 6 & N > 10) | N * pmi > 600] %>%
.[! word %>% str_detect("\\d")] %>%
.[order(-N, -pmi)])
# tri-gram PMI
tri_wc <- copy(dt_wc[ngram == 3])
tri_wc_right <- tri_wc[, `:=`(w1 = str_sub(word, 1, -2),
w2 = str_sub(word, -1, -1))] %>% copy
tri_wc_left <- tri_wc[, `:=`(w1 = str_sub(word, 1, 1),
w2 = str_sub(word, 2, -1))] %>% copy
tri_wc_right <- tri_wc_right %>%
merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>%
merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>%
.[, pmi := log((p^1) / (p1 * p2))]
tri_wc_left <- tri_wc_left %>%
merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>%
merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>%
.[, pmi := log((p^1) / (p1 * p2))]
(tri_wc_final <- tri_wc_right[(pmi > 10 & N > 20) | N * pmi > 1200][order(-pmi)] %>%
rbind(tri_wc_left[(pmi > 10 & N > 20) | N * pmi > 1200][order(-pmi)]) %>%
unique(by = "word") %>%
.[! word %>% str_detect("\\d")])
# four-gram PMI
four_wc <- copy(dt_wc[ngram == 4])
four_wc_right <- four_wc[, `:=`(w1 = str_sub(word, 1, -2),
w2 = str_sub(word, -1, -1))] %>%
copy %>%
merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>%
merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>%
.[, pmi := log((p^1) / (p1 * p2))]
four_wc_left <- four_wc[, `:=`(w1 = str_sub(word, 1, 1),
w2 = str_sub(word, 2, -1))] %>%
copy %>%
merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>%
merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>%
.[, pmi := log((p^1) / (p1 * p2))]
(four_wc_final <- four_wc_right[(pmi > 8 & N > 20) | N * pmi > 1600][order(-pmi)] %>%
rbind(four_wc_left[(pmi > 8 & N > 20) | N * pmi > 1600][order(-pmi)]) %>%
unique(by = "word") %>%
.[! word %>% str_detect("\\d")])
#> w2 w1 word N ngram p p1
#> <char> <char> <char> <int> <int> <num> <num>
#> 1: 灣 台 台灣 3846 2 4.160397e-03 7.374375e-03
#> 2: d d dd 2426 2 2.624317e-03 4.940215e-03
#> 3: 是 就 就是 2230 2 2.412295e-03 8.249254e-03
#> 4: 是 不 不是 2104 2 2.275995e-03 1.624921e-02
#> 5: 有 沒 沒有 1978 2 2.139695e-03 5.579801e-03
#> ---
#> 1525: 姨 紅 紅姨 11 2 1.189921e-05 4.444420e-04
#> 1526: 物 偽 偽物 11 2 1.189921e-05 4.761878e-05
#> 1527: 疑 嫌 嫌疑 11 2 1.189921e-05 1.372541e-04
#> 1528: 果 芒 芒果 11 2 1.189921e-05 1.680663e-05
#> 1529: 掉 抄 抄掉 11 2 1.189921e-05 6.442541e-05
#> p2 pmi
#> <num> <num>
#> 1: 0.0038347124 4.991260
#> 2: 0.0049402151 4.677758
#> 3: 0.0173388384 2.825262
#> 4: 0.0173388384 2.089179
#> 5: 0.0109793969 3.553245
#> ---
#> 1525: 0.0000616243 6.074107
#> 1526: 0.0005919668 6.045305
#> 1527: 0.0002091492 6.027101
#> 1528: 0.0017133424 6.024008
#> 1529: 0.0004528453 6.010924
#> w2 w1 word N ngram p p1
#> <char> <char> <char> <int> <int> <num> <num>
#> 1: 昆 吳茂 吳茂昆 24 3 3.052192e-05 2.704366e-05
#> 2: 閔 管中 管中閔 26 3 3.306542e-05 3.137065e-05
#> 3: 燕 盧秀 盧秀燕 43 3 5.468512e-05 4.651510e-05
#> 4: 峻 陳景 陳景峻 42 3 5.341337e-05 4.759685e-05
#> 5: 址 短網 短網址 172 3 2.187405e-04 1.860604e-04
#> 6: 蓮 呂秀 呂秀蓮 176 3 2.238274e-04 1.936326e-04
#> 7: 飽 吃到 吃到飽 257 3 3.268389e-04 3.104612e-04
#> 8: z zz zzz 174 3 2.212840e-04 2.542104e-04
#> 9: 樂 日快 日快樂 291 3 3.700783e-04 3.158700e-04
#> 10: 值 灣價 灣價值 443 3 5.633839e-04 4.824589e-04
#> 11: 哲 柯文 柯文哲 741 3 9.423644e-04 8.199638e-04
#> 12: 親 一家 一家親 286 3 3.637196e-04 3.980827e-04
#> 13: 卦 的八 的八卦 172 3 2.187405e-04 2.022866e-04
#> 14: 運 世大 世大運 241 3 3.064910e-04 2.617827e-04
#> 15: 候 的時 的時候 284 3 3.611761e-04 4.251264e-04
#> 16: 聞 整新 整新聞 424 3 5.392207e-04 4.586605e-04
#> 17: 府 民政 民政府 285 3 3.624479e-04 3.180335e-04
#> 18: 結 聞連 聞連結 180 3 2.289144e-04 1.957961e-04
#> 19: w ww www 204 3 2.594364e-04 2.909898e-04
#> 20: 嘔 嘔嘔 嘔嘔嘔 912 3 1.159833e-03 1.179104e-03
#> 21: 快 生日 生日快 282 3 3.586326e-04 3.558946e-04
#> 22: 喔 喔喔 喔喔喔 204 3 2.594364e-04 2.758454e-04
#> 23: 黨 民進 民進黨 846 3 1.075898e-03 9.292202e-04
#> 24: 新 完整 完整新 424 3 5.392207e-04 4.889494e-04
#> 25: t km kmt 248 3 3.153932e-04 2.769271e-04
#> 26: 得 我覺 我覺得 231 3 2.937735e-04 2.574557e-04
#> 27: 道 不知 不知道 433 3 5.506664e-04 5.722439e-04
#> 28: 哈 哈哈 哈哈哈 558 3 7.096348e-04 1.074174e-03
#> 29: t pt ptt 271 3 3.446434e-04 3.191152e-04
#> 30: 黨 國民 國民黨 352 3 4.476549e-04 4.900312e-04
#> 31: 文 蔡英 蔡英文 283 3 3.599044e-04 3.115430e-04
#> 32: 家 岸一 岸一家 240 3 3.052192e-04 2.639461e-04
#> 33: 麼 為什 為什麼 347 3 4.412962e-04 3.753660e-04
#> 34: p dp dpp 583 3 7.414284e-04 6.890725e-04
#> 35: 市 台北 台北市 352 3 4.476549e-04 8.567432e-04
#> 36: d dd ddd 1900 3 2.416319e-03 2.624317e-03
#> 37: 價 台灣 台灣價 426 3 5.417642e-04 4.160397e-03
#> 38: d xd xdd 491 3 6.244277e-04 1.086073e-03
#> 39: 有 有沒 有沒有 545 3 6.931020e-04 6.068598e-04
#> 40: 是 是不 是不是 387 3 4.921660e-04 8.178004e-04
#> 41: 動員 運 運動員 188 3 2.390884e-04 8.748784e-04
#> 42: 民國 華 華民國 198 3 2.518059e-04 8.823480e-04
#> 43: 岸一 兩 兩岸一 241 3 3.064910e-04 1.340795e-03
#> 44: 己的 自 自己的 271 3 3.446434e-04 3.391204e-03
#> 45: 來就 本 本來就 239 3 3.039475e-04 2.728276e-03
#> 46: 的很 真 真的很 291 3 3.700783e-04 2.992514e-03
#> 47: 灣人 台 台灣人 441 3 5.608404e-04 7.374375e-03
#> w2 w1 word N ngram p p1
#> p2 pmi
#> <num> <num>
#> 1: 2.801105e-05 10.603904
#> 2: 2.894475e-05 10.502737
#> 3: 4.295027e-05 10.217282
#> 4: 4.388397e-05 10.149256
#> 5: 1.848729e-04 8.757656
#> 6: 3.025193e-04 8.248278
#> 7: 3.874862e-04 7.907239
#> 8: 4.239005e-04 7.627296
#> 9: 6.293149e-04 7.529263
#> 10: 7.301546e-04 7.377320
#> 11: 7.703038e-04 7.307857
#> 12: 6.554585e-04 7.239899
#> 13: 8.057845e-04 7.201895
#> 14: 8.748784e-04 7.199099
#> 15: 6.479889e-04 7.178616
#> 16: 9.757182e-04 7.094151
#> 17: 1.005597e-03 7.032898
#> 18: 1.082160e-03 6.985071
#> 19: 8.356629e-04 6.972508
#> 20: 1.198873e-03 6.709895
#> 21: 1.236221e-03 6.703360
#> 22: 1.202608e-03 6.661934
#> 23: 2.230613e-03 6.252044
#> 24: 2.192331e-03 6.220656
#> 25: 2.495784e-03 6.123218
#> 26: 2.535934e-03 6.109155
#> 27: 2.147514e-03 6.105008
#> 28: 1.505127e-03 6.084321
#> 29: 2.495784e-03 6.070110
#> 30: 2.230613e-03 6.015032
#> 31: 3.270757e-03 5.867035
#> 32: 3.419215e-03 5.823630
#> 33: 3.816972e-03 5.730112
#> 34: 3.601287e-03 5.699696
#> 35: 1.993453e-03 5.568771
#> 36: 4.940215e-03 5.227771
#> 37: 1.076558e-03 4.795451
#> 38: 4.940215e-03 4.756858
#> 39: 1.097940e-02 4.644614
#> 40: 1.733884e-02 3.547004
#> 41: 2.347390e-04 7.059785
#> 42: 2.509652e-04 7.036268
#> 43: 2.639461e-04 6.763935
#> 44: 2.996438e-04 5.826486
#> 45: 3.569763e-04 5.743271
#> 46: 4.045732e-04 5.722524
#> 47: 5.333010e-04 4.960094
#> p2 pmi
#> w2 w1 word N ngram p p1
#> <char> <char> <char> <int> <int> <num> <num>
#> 1: 妙 莫名其 莫名其妙 24 4 3.626643e-05 3.179367e-05
#> 2: 閱 先仔細 先仔細閱 25 4 3.777753e-05 3.179367e-05
#> 3: 稿 社論特 社論特稿 32 4 4.835524e-05 4.069590e-05
#> 4: 壇 雙城論 雙城論壇 22 4 3.324423e-05 2.797843e-05
#> 5: 勁 越不對 越不對勁 34 4 5.137744e-05 5.850036e-05
#> 6: 詢 檢舉板 檢舉板詢 26 4 3.928863e-05 3.306542e-05
#> 7: 畜 支那賤 支那賤畜 39 4 5.893295e-05 5.341337e-05
#> 8: 構 精細結 精細結構 42 4 6.346625e-05 5.595686e-05
#> 9: 鬧 重者以 重者以鬧 25 4 3.777753e-05 3.179367e-05
#> 10: 禁 板嚴格 板嚴格禁 25 4 3.777753e-05 3.179367e-05
#> 11: 細 請先仔 請先仔細 25 4 3.777753e-05 3.179367e-05
#> 12: 仔 前請先 前請先仔 25 4 3.777753e-05 3.433717e-05
#> 13: 替 政黨輪 政黨輪替 29 4 4.382194e-05 3.688066e-05
#> 14: 址 供短網 供短網址 32 4 4.835524e-05 4.069590e-05
#> 15: 址 或短網 或短網址 140 4 2.115542e-04 1.780446e-04
#> 16: 止 嚴格禁 嚴格禁止 25 4 3.777753e-05 3.179367e-05
#> 17: 嗚 嗚嗚嗚 嗚嗚嗚嗚 23 4 3.475533e-05 5.086987e-05
#> 18: 狀 維持現 維持現狀 34 4 5.137744e-05 4.323939e-05
#> 19: 訪 電台專 電台專訪 25 4 3.777753e-05 3.179367e-05
#> 20: 桶 將被水 將被水桶 25 4 3.777753e-05 3.179367e-05
#> 21: 桶 文字水 文字水桶 25 4 3.777753e-05 3.179367e-05
#> 22: 桶 治類水 治類水桶 32 4 4.835524e-05 4.069590e-05
#> 23: 桶 貼者水 貼者水桶 43 4 6.497736e-05 5.468512e-05
#> 24: 章 充實文 充實文章 25 4 3.777753e-05 3.179367e-05
#> 25: 章 刪除文 刪除文章 32 4 4.835524e-05 4.069590e-05
#> 26: 濟 共享經 共享經濟 26 4 3.928863e-05 3.433717e-05
#> 27: 嚴 本看板 本看板嚴 25 4 3.777753e-05 3.179367e-05
#> 28: 驗 智力測 智力測驗 25 4 3.777753e-05 3.179367e-05
#> 29: 額 刪也算 刪也算額 43 4 6.497736e-05 8.647879e-05
#> 30: 擊 網友點 網友點擊 32 4 4.835524e-05 4.196765e-05
#> 31: 注 問卦請 問卦請注 25 4 3.777753e-05 3.179367e-05
#> 32: 讀 仔細閱 仔細閱讀 25 4 3.777753e-05 3.179367e-05
#> 33: 欠 互不相 互不相欠 21 4 3.173313e-05 2.925018e-05
#> 34: 醒 看完提 看完提醒 25 4 3.777753e-05 3.179367e-05
#> 35: 刪 刪及被 刪及被刪 25 4 3.777753e-05 3.179367e-05
#> 36: 刪 刪或自 刪或自刪 43 4 6.497736e-05 5.468512e-05
#> 37: 刪 提醒請 提醒請刪 25 4 3.777753e-05 3.179367e-05
#> 38: 刪 照板規 照板規刪 32 4 4.835524e-05 4.069590e-05
#> 39: 態 意識形 意識形態 21 4 3.173313e-05 2.670668e-05
#> 40: 短 需提供 需提供短 32 4 4.835524e-05 4.069590e-05
#> 41: 寫 題沒有 題沒有寫 32 4 4.835524e-05 4.069590e-05
#> 42: 篇 也算兩 也算兩篇 25 4 3.777753e-05 3.179367e-05
#> 43: 規 依照板 依照板規 32 4 4.835524e-05 4.069590e-05
#> 44: 規 相關板 相關板規 25 4 3.777753e-05 3.179367e-05
#> 45: 廠 深澳電 深澳電廠 23 4 3.475533e-05 3.052192e-05
#> 46: 群 客原族 客原族群 21 4 3.173313e-05 2.670668e-05
#> 47: 刪 也會被 也會被刪 32 4 4.835524e-05 5.214162e-05
#> 48: 格 看板嚴 看板嚴格 25 4 3.777753e-05 3.179367e-05
#> 49: 源 媒體來 媒體來源 145 4 2.191097e-04 1.869468e-04
#> 50: 則 張貼一 張貼一則 43 4 6.497736e-05 5.468512e-05
#> 51: 亞 馬來西 馬來西亞 83 4 1.254214e-04 1.080985e-04
#> 52: 樂 生日快 生日快樂 282 4 4.261306e-04 3.586326e-04
#> 53: 親 岸一家 岸一家親 221 4 3.339534e-04 3.052192e-04
#> 54: 值 台灣價 台灣價值 423 4 6.391959e-04 5.417642e-04
#> 55: 聞 完整新 完整新聞 424 4 6.407070e-04 5.392207e-04
#> 56: 嘔 嘔嘔嘔 嘔嘔嘔嘔 740 4 1.118215e-03 1.159833e-03
#> 57: 哈 哈哈哈 哈哈哈哈 312 4 4.714636e-04 7.096348e-04
#> 58: d ddd dddd 1531 4 2.313496e-03 2.416319e-03
#> 59: d xdd xddd 369 4 5.575964e-04 6.244277e-04
#> 60: 讀相關 閱 閱讀相關 25 4 3.777753e-05 7.656353e-05
#> 61: 台支那 滯 滯台支那 33 4 4.986634e-05 7.843093e-05
#> 62: 體中文 繁 繁體中文 48 4 7.253286e-05 8.216574e-05
#> 63: 合報導 綜 綜合報導 70 4 1.057771e-04 1.073757e-04
#> 64: 都不能 稿 稿都不能 32 4 4.835524e-05 7.843093e-05
#> 65: 騙集團 詐 詐騙集團 40 4 6.044405e-05 1.260497e-04
#> 66: 客原族 閩 閩客原族 21 4 3.173313e-05 1.279171e-04
#> 67: 存在感 刷 刷存在感 34 4 5.137744e-05 1.120442e-04
#> 68: 止政治 禁 禁止政治 25 4 3.777753e-05 1.456574e-04
#> 69: 閱讀相 細 細閱讀相 25 4 3.777753e-05 1.465911e-04
#> 70: 一支持 唯 唯一支持 53 4 8.008837e-05 1.493923e-04
#> 71: 結構常 細 細結構常 34 4 5.137744e-05 1.465911e-04
#> 72: 果日報 蘋 蘋果日報 61 4 9.217718e-05 1.605967e-04
#> 73: 方便網 址 址方便網 32 4 4.835524e-05 1.848729e-04
#> 74: 政治問 止 止政治問 25 4 3.777753e-05 1.914088e-04
#> 75: 告也會 廣 廣告也會 32 4 4.835524e-05 2.250221e-04
#> 76: 重者以 嚴 嚴重者以 25 4 3.777753e-05 2.324917e-04
#> 77: 有專板 否 否有專板 25 4 3.777753e-05 2.427624e-04
#> 78: 相關板 讀 讀相關板 25 4 3.777753e-05 2.605027e-04
#> 79: 請刪除 醒 醒請刪除 25 4 3.777753e-05 2.810442e-04
#> 80: 也算兩 刪 刪也算兩 25 4 3.777753e-05 2.857127e-04
#> 81: 如蘋果 例 例如蘋果 34 4 5.137744e-05 2.857127e-04
#> 82: 除ct 刪 刪除ct 25 4 3.777753e-05 2.857127e-04
#> 83: 網址方 短 短網址方 32 4 4.835524e-05 2.997182e-04
#> 84: 摩新聞 奇 奇摩新聞 37 4 5.591075e-05 3.230607e-04
#> 85: 曲棍球 吞 吞曲棍球 24 4 3.626643e-05 1.185801e-04
#> 86: 刪除文 規 規刪除文 32 4 4.835524e-05 3.380000e-04
#> 87: 者刪除 違 違者刪除 32 4 4.835524e-05 3.650773e-04
#> 88: 禁止政 格 格禁止政 25 4 3.777753e-05 3.678784e-04
#> 89: apa j japa 63 4 9.519938e-05 3.641436e-04
#> 90: 岸一家 兩 兩岸一家 238 4 3.596421e-04 1.340795e-03
#> w2 w1 word N ngram p p1
#> p2 pmi
#> <num> <num>
#> 1: 7.096132e-05 9.685001
#> 2: 7.656353e-05 9.649837
#> 3: 7.843093e-05 9.625739
#> 4: 9.056905e-05 9.481845
#> 5: 8.403314e-05 9.254466
#> 6: 1.139116e-04 9.252535
#> 7: 1.176464e-04 9.146166
#> 8: 1.269834e-04 9.097381
#> 9: 1.363204e-04 9.072950
#> 10: 1.456574e-04 9.006700
#> 11: 1.465911e-04 9.000310
#> 12: 1.493923e-04 8.904421
#> 13: 1.690000e-04 8.858059
#> 14: 1.848729e-04 8.768289
#> 15: 1.848729e-04 8.768289
#> 16: 1.914088e-04 8.733546
#> 17: 1.101768e-04 8.732486
#> 18: 2.184862e-04 8.601235
#> 19: 2.231547e-04 8.580093
#> 20: 2.240884e-04 8.575917
#> 21: 2.240884e-04 8.575917
#> 22: 2.240884e-04 8.575917
#> 23: 2.240884e-04 8.575917
#> 24: 2.250221e-04 8.571759
#> 25: 2.250221e-04 8.571759
#> 26: 2.231547e-04 8.542352
#> 27: 2.324917e-04 8.539103
#> 28: 2.427624e-04 8.495875
#> 29: 1.540608e-04 8.492303
#> 30: 2.483646e-04 8.442288
#> 31: 2.605027e-04 8.425344
#> 32: 2.605027e-04 8.425344
#> 33: 2.530331e-04 8.363466
#> 34: 2.810442e-04 8.349446
#> 35: 2.857127e-04 8.332971
#> 36: 2.857127e-04 8.332971
#> 37: 2.857127e-04 8.332971
#> 38: 2.857127e-04 8.332971
#> 39: 2.885138e-04 8.323215
#> 40: 2.997182e-04 8.285115
#> 41: 3.258619e-04 8.201484
#> 42: 3.323978e-04 8.181626
#> 43: 3.380000e-04 8.164912
#> 44: 3.380000e-04 8.164912
#> 45: 3.351989e-04 8.130674
#> 46: 3.520055e-04 8.124311
#> 47: 2.857127e-04 8.085135
#> 48: 3.678784e-04 8.080205
#> 49: 3.678784e-04 8.066506
#> 50: 3.734806e-04 8.065092
#> 51: 3.837514e-04 8.014152
#> 52: 6.293149e-04 7.543326
#> 53: 6.554585e-04 7.420147
#> 54: 7.301546e-04 7.387634
#> 55: 9.757182e-04 7.104784
#> 56: 1.198873e-03 6.689831
#> 57: 1.505127e-03 6.089970
#> 58: 4.940215e-03 5.266861
#> 59: 4.940215e-03 5.197146
#> 60: 3.179367e-05 9.649837
#> 61: 4.196765e-05 9.625739
#> 62: 6.613084e-05 9.499177
#> 63: 8.902228e-05 9.311624
#> 64: 5.977210e-05 9.241328
#> 65: 5.086987e-05 9.151281
#> 66: 2.670668e-05 9.136575
#> 67: 5.214162e-05 9.081853
#> 68: 3.179367e-05 9.006700
#> 69: 3.179367e-05 9.000310
#> 70: 6.740258e-05 8.981382
#> 71: 4.451114e-05 8.971323
#> 72: 7.757656e-05 8.909062
#> 73: 4.069590e-05 8.768289
#> 74: 3.433717e-05 8.656585
#> 75: 4.069590e-05 8.571759
#> 76: 3.179367e-05 8.539103
#> 77: 3.306542e-05 8.456654
#> 78: 3.179367e-05 8.425344
#> 79: 3.179367e-05 8.349446
#> 80: 3.179367e-05 8.332971
#> 81: 4.323939e-05 8.332971
#> 82: 3.179367e-05 8.332971
#> 83: 4.069590e-05 8.285115
#> 84: 4.705463e-05 8.210117
#> 85: 8.393529e-05 8.200768
#> 86: 4.069590e-05 8.164912
#> 87: 4.069590e-05 8.087849
#> 88: 3.179367e-05 8.080205
#> 89: 8.393529e-05 8.043889
#> 90: 3.052192e-04 6.778571
#> p2 pmi
bi_wc_final[, .(word, N)] %>%
write.table("dict/user_dict_utf8.txt", quote = FALSE,
append = FALSE,
sep = " ",
row.names = FALSE, col.names = FALSE)
tri_wc_final[, .(word, N)] %>%
write.table("dict/user_dict_utf8.txt", quote = FALSE,
append = TRUE,
sep = " ",
row.names = FALSE, col.names = FALSE)
four_wc_final[, .(word, N)] %>%
write.table("dict/user_dict_utf8.txt", quote = FALSE,
append = TRUE,
sep = " ",
row.names = FALSE, col.names = FALSE)
mix_seg <- worker(type = "mix",
dict = "dict/jieba_dict_utf8.txt",
stop_word = "dict/stop_utf8.txt",
user = "dict/user_dict_utf8.txt",
symbol = FALSE,
encoding = "UTF-8")
# tokenize
text_seg <- dt[, paste(post_text, push_text)] %>% lapply(cutter, mix_seg)
segment("中華電信499吃到飽之亂引爆戰火", mix_seg)
#> [1] "中華" "電信" "499" "吃到飽" "之亂" "引爆" "戰火"
# text2vec
text_token <- itoken(text_seg)
vocab <- create_vocabulary(
text_token,
ngram=c(1L, 2L),
sep_ngram = "_"
)
pruned_vocab <- prune_vocabulary(
vocab,
term_count_min = 10,
doc_proportion_min = 0.001,
doc_proportion_max = 0.9,
vocab_term_max = 20000
)
# class(pruned_vocab) <- c("text2vec_vocabulary", "data.table", "data.frame")
pruned_vocab <- pruned_vocab[str_length(pruned_vocab$term) >= 2,] # remove 1-word term
# Make DTM
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm <- create_dtm(text_token, vectorizer)
# Check most freq terms
Matrix::colSums(dtm) %>% sort(decreasing = T) %>% head(20)
#> 台灣 中國 一個 都是 五樓 的人 民進黨 應該 垃圾 美國
#> 3008 1604 1331 963 962 865 846 842 811 808
#> 柯文哲 根本 支持 也是 日本 台北 市長 不用 柯粉 總統
#> 741 713 711 709 686 604 603 533 532 527
wordVectors
# Prepare tokenizes text file
tokenize_text_lines <- text_seg %>%
sapply(paste, collapse = " ") # Tokens are split on spaces.
tokenize_text_lines %>% writeLines("dataset/tokenize_text_lines.txt")
# Fit models
vector_set <- train_word2vec(train_file = "dataset/tokenize_text_lines.txt",
output_file = "dataset/ptt_gossiping_word2vec.bin",
force = TRUE,
vectors = 200,
threads = parallel::detectCores()-1,
window = 6)
vector_set <- vector_set[-1,]
相近關聯詞
nearest_to(vector_set, vector_set[["柯文哲"]], n = 20)
#> 柯文哲 柯文 哲為 但柯 輸誠
#> 8.881784e-16 2.163745e-01 2.461457e-01 2.645223e-01 2.653944e-01
#> 表態 柯說 墨綠 綠營 脫口而出
#> 2.726296e-01 2.801413e-01 2.809439e-01 2.880358e-01 2.892135e-01
#> 直言 民進 氣勢 勇弟 陳景峻
#> 2.896340e-01 2.899235e-01 2.918302e-01 2.939741e-01 2.951965e-01
#> 修補 選對會 專訪 政見 致歉
#> 3.020913e-01 3.021007e-01 3.068234e-01 3.087859e-01 3.103285e-01
nearest_to(vector_set, vector_set[["499"]], n = 20)
#> 499 吃到飽 399 183 299
#> 7.771561e-16 1.576027e-01 1.827954e-01 1.962149e-01 1.985694e-01
#> 資費 不限 NP 限時 699
#> 2.024386e-01 2.029390e-01 2.070329e-01 2.101635e-01 2.125551e-01
#> 限速 剛出 退傭 方案 之亂
#> 2.150124e-01 2.222288e-01 2.267977e-01 2.270077e-01 2.274960e-01
#> 搶辦 月繳 種花 599 中華電信
#> 2.278977e-01 2.295549e-01 2.307604e-01 2.310141e-01 2.311899e-01
nearest_to(vector_set, vector_set[["台灣價值"]], n = 20)
#> 台灣 說 會 沒 中國 一個 喔 柯 想 推
#> NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
#> 幹 新聞 五樓 民進黨 應該 年 垃圾 做 美國 支持
#> NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Dimension reduction with t-SNE
# tsne_vec <- tsne(vector_set, k = 2, epoch = 10)
fit_pca <- prcomp(vector_set, scale = FALSE)
fviz_eig(fit_pca)
dt_pca <- fit_pca$x %>% as.data.table(keep.rownames = FALSE)
dt_pca[, word := rownames(fit_pca$x)]
db <- dbscan(dt_pca[, .(PC1, PC2, PC3, PC4, PC5)], eps = .4, minPts = 4)
dt_pca_2 <- dt_pca[, .(word, PC1, PC2, cluster = db$cluster)]
Plotly
p <- plot_ly(
data = dt_pca_2, x = ~PC1, y = ~PC2,
# color = ~cluster,
# Hover text:
text = ~paste(word)
)
p
#> No trace type specified:
#> Based on info supplied, a 'scatter' trace seems appropriate.
#> Read more about this trace type -> https://plot.ly/r/reference/#scatter
#> No scatter mode specifed:
#> Setting the mode to markers
#> Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# 利用 tf-idf 關鍵詞算法,處理高頻詞高估及低頻詞低估的問題,取得整個文檔的關鍵詞
# tf-idf
tfidf = TfIdf$new() # define tfidf model
# fit model to train data and transform train data with fitted model
dtm_train_tfidf = fit_transform(dtm, tfidf)
# tfidf modified by fit_transform() call!
# Key term
key_term <- dtm_train_tfidf %>%
find_freq_terms(lowfreq = 0.05) %>%
Matrix::colSums(.) %>%
data.frame() %>%
data.table(keep.rownames = TRUE) %>%
setnames(c("keyword", "sum_tf_idf")) %>%
.[order(-sum_tf_idf)]
# key_term %>% head(100) %>% DT::datatable(extensions = "Responsive")
# Wordcloud
d <- key_term %>% head(200)
ncolor <- nrow(d)
getPalette = colorRampPalette(RColorBrewer::brewer.pal(8, "Set2"))
wordcloud2(d,
size = 0.5,
fontFamily = "Noto Sans CJK TC",
fontWeight = "normal",
rotateRatio = 0,
color = getPalette(ncolor),
shape = "circle")