# Load Packages
suppressPackageStartupMessages({
  library(magrittr)
  library(DBI)
  library(data.table); options(datatable.print.class = TRUE)
  library(RSQLite)
  # library(tidytext)
  library(tokenizers)
  library(text2vec)
  library(ggplot2)
  library(plotly)
  library(jiebaR)
  library(stringr)
  # devtools::install_github("bmschmidt/wordVectors")
  library(wordVectors)
  library(wordcloud2)
  # library(tsne)
  library(dbscan)
  library(factoextra)
})

source("R/utils.R", encoding = "UTF-8")
source("R/filter_dtm.R", encoding = "UTF-8")

Load data

con <- dbConnect(RSQLite::SQLite(), "./dataset/db.sqlite")
dt <- dbReadTable(con, "Gossiping") %>% setDT
dbDisconnect(con)

Data cleansing

# dt[sample(seq(.N), 5)] %>% View  # take a look

# Remove reply text and messages
dt[title %>% str_detect("^Re:"),
    `:=`(post_text = post_text %>% 
           str_replace_all('(?m)^(:|※).*$', "") %>% 
           str_replace_all('(?im)(posted|sent) from.*$', ""))]

# Remove URLs
url_regex = 'http[s]?://(?:[a-zA-Z0-9$-_@.&+!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+(?:#[-A-z0-9]+)?'
dt[, `:=`(post_text = post_text %>% str_replace_all(url_regex, ""),
          push_text = push_text %>% str_replace_all(url_regex, ""))]


# Remove all but Eng and Zht
dt[, `:=`(post_text = post_text %>%
            str_replace_all("[^\u4e00-\u9fa5A-Za-z\\d\\n\\s]+", " "),
          push_text = push_text %>%
            str_replace_all("[^\u4e00-\u9fa5A-Za-z\\d\\n\\s]+", " "))]

# Filter
dt <- dt[! (str_detect(title, "置底|公告") | str_detect(post_text, "置底|公告"))]

dt[sample(seq(.N), 5)] %>% View  # take a look again

Tokenization

# 起手式,結巴建立斷詞器
mix_seg <- worker(type = "mix",
                  dict = "dict/jieba_dict_utf8.txt",
                  stop_word = "dict/stop_utf8.txt",
                  symbol = FALSE,
                  encoding = "UTF-8")

# tokenize
text_seg <- dt[, paste(post_text, push_text)] %>% lapply(cutter, mix_seg)
segment("中華電信499吃到飽之亂引爆戰火", mix_seg)
#> [1] "中華電信" "499"      "吃"       "飽"       "之亂"     "引爆"    
#> [7] "戰火"

New-term detection

Play with n-grams

x <- "中華電信499吃到飽之亂引爆戰火"
tokenize_character_shingles(x, n = 4, n_min = 1, simplify = TRUE)
#>  [1] "中"       "中華"     "中華電"   "中華電信" "華"       "華電"    
#>  [7] "華電信"   "華電信4"  "電"       "電信"     "電信4"    "電信49"  
#> [13] "信"       "信4"      "信49"     "信499"    "4"        "49"      
#> [19] "499"      "499吃"    "9"        "99"       "99吃"     "99吃到"  
#> [25] "9"        "9吃"      "9吃到"    "9吃到飽"  "吃"       "吃到"    
#> [31] "吃到飽"   "吃到飽之" "到"       "到飽"     "到飽之"   "到飽之亂"
#> [37] "飽"       "飽之"     "飽之亂"   "飽之亂引" "之"       "之亂"    
#> [43] "之亂引"   "之亂引爆" "亂"       "亂引"     "亂引爆"   "亂引爆戰"
#> [49] "引"       "引爆"     "引爆戰"   "引爆戰火" "爆"       "爆戰"    
#> [55] "爆戰火"   "戰"       "戰火"     "火"

Try PTT

word_count_table <- dt[, paste(post_text, push_text)] %>% 
  str_split("[\n\\s]+") %>% 
  unlist %>% 
  tokenize_character_shingles(n = 4, n_min = 1) %>%
  unlist %>% 
  table
dt_wc <- word_count_table %>% as.data.table %>% setnames(c("word", "N"))
dt_wc

dt_wc[, ngram := str_length(word)]
dt_wc[, p := N / sum(N), by = .(ngram)]

# bi-gram PMI
bi_wc <- copy(dt_wc[ngram == 2])
bi_wc[, `:=`(w1 = str_sub(word, 1, -2), 
             w2 = str_sub(word, -1, -1))]
bi_wc <- bi_wc %>% 
  merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>%
  merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>% 
  .[, pmi := log((p^1) / (p1 * p2))]
(bi_wc_final <- bi_wc[(pmi > 6 & N > 10) | N * pmi > 600] %>% 
    .[! word %>% str_detect("\\d")] %>% 
    .[order(-N, -pmi)])


# tri-gram PMI
tri_wc <- copy(dt_wc[ngram == 3])
tri_wc_right <- tri_wc[, `:=`(w1 = str_sub(word, 1, -2), 
                              w2 = str_sub(word, -1, -1))] %>% copy
tri_wc_left <- tri_wc[, `:=`(w1 = str_sub(word, 1, 1), 
                             w2 = str_sub(word, 2, -1))] %>% copy
tri_wc_right <- tri_wc_right %>% 
  merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>% 
  merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>% 
  .[, pmi := log((p^1) / (p1 * p2))]
tri_wc_left <- tri_wc_left %>% 
  merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>% 
  merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>% 
  .[, pmi := log((p^1) / (p1 * p2))]

(tri_wc_final <- tri_wc_right[(pmi > 10 & N > 20) | N * pmi > 1200][order(-pmi)] %>% 
  rbind(tri_wc_left[(pmi > 10 & N > 20) | N * pmi > 1200][order(-pmi)]) %>% 
    unique(by = "word") %>% 
    .[! word %>% str_detect("\\d")])

# four-gram PMI
four_wc <- copy(dt_wc[ngram == 4])

four_wc_right <- four_wc[, `:=`(w1 = str_sub(word, 1, -2), 
                              w2 = str_sub(word, -1, -1))] %>% 
  copy %>% 
  merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>% 
  merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>% 
  .[, pmi := log((p^1) / (p1 * p2))]
four_wc_left <- four_wc[, `:=`(w1 = str_sub(word, 1, 1), 
                             w2 = str_sub(word, 2, -1))] %>% 
  copy %>% 
  merge(dt_wc[, .(word, p1 = p)], by.x = "w1", by.y = "word", all.x = TRUE) %>% 
  merge(dt_wc[, .(word, p2 = p)], by.x = "w2", by.y = "word", all.x = TRUE) %>% 
  .[, pmi := log((p^1) / (p1 * p2))]

(four_wc_final <- four_wc_right[(pmi > 8 & N > 20) | N * pmi > 1600][order(-pmi)] %>% 
  rbind(four_wc_left[(pmi > 8 & N > 20) | N * pmi > 1600][order(-pmi)]) %>% 
    unique(by = "word") %>% 
    .[! word %>% str_detect("\\d")])
#>           w2     w1   word     N ngram            p           p1
#>       <char> <char> <char> <int> <int>        <num>        <num>
#>    1:     灣     台   台灣  3846     2 4.160397e-03 7.374375e-03
#>    2:      d      d     dd  2426     2 2.624317e-03 4.940215e-03
#>    3:     是     就   就是  2230     2 2.412295e-03 8.249254e-03
#>    4:     是     不   不是  2104     2 2.275995e-03 1.624921e-02
#>    5:     有     沒   沒有  1978     2 2.139695e-03 5.579801e-03
#>   ---                                                           
#> 1525:     姨     紅   紅姨    11     2 1.189921e-05 4.444420e-04
#> 1526:     物     偽   偽物    11     2 1.189921e-05 4.761878e-05
#> 1527:     疑     嫌   嫌疑    11     2 1.189921e-05 1.372541e-04
#> 1528:     果     芒   芒果    11     2 1.189921e-05 1.680663e-05
#> 1529:     掉     抄   抄掉    11     2 1.189921e-05 6.442541e-05
#>                 p2      pmi
#>              <num>    <num>
#>    1: 0.0038347124 4.991260
#>    2: 0.0049402151 4.677758
#>    3: 0.0173388384 2.825262
#>    4: 0.0173388384 2.089179
#>    5: 0.0109793969 3.553245
#>   ---                      
#> 1525: 0.0000616243 6.074107
#> 1526: 0.0005919668 6.045305
#> 1527: 0.0002091492 6.027101
#> 1528: 0.0017133424 6.024008
#> 1529: 0.0004528453 6.010924
#>         w2     w1   word     N ngram            p           p1
#>     <char> <char> <char> <int> <int>        <num>        <num>
#>  1:     昆   吳茂 吳茂昆    24     3 3.052192e-05 2.704366e-05
#>  2:     閔   管中 管中閔    26     3 3.306542e-05 3.137065e-05
#>  3:     燕   盧秀 盧秀燕    43     3 5.468512e-05 4.651510e-05
#>  4:     峻   陳景 陳景峻    42     3 5.341337e-05 4.759685e-05
#>  5:     址   短網 短網址   172     3 2.187405e-04 1.860604e-04
#>  6:     蓮   呂秀 呂秀蓮   176     3 2.238274e-04 1.936326e-04
#>  7:     飽   吃到 吃到飽   257     3 3.268389e-04 3.104612e-04
#>  8:      z     zz    zzz   174     3 2.212840e-04 2.542104e-04
#>  9:     樂   日快 日快樂   291     3 3.700783e-04 3.158700e-04
#> 10:     值   灣價 灣價值   443     3 5.633839e-04 4.824589e-04
#> 11:     哲   柯文 柯文哲   741     3 9.423644e-04 8.199638e-04
#> 12:     親   一家 一家親   286     3 3.637196e-04 3.980827e-04
#> 13:     卦   的八 的八卦   172     3 2.187405e-04 2.022866e-04
#> 14:     運   世大 世大運   241     3 3.064910e-04 2.617827e-04
#> 15:     候   的時 的時候   284     3 3.611761e-04 4.251264e-04
#> 16:     聞   整新 整新聞   424     3 5.392207e-04 4.586605e-04
#> 17:     府   民政 民政府   285     3 3.624479e-04 3.180335e-04
#> 18:     結   聞連 聞連結   180     3 2.289144e-04 1.957961e-04
#> 19:      w     ww    www   204     3 2.594364e-04 2.909898e-04
#> 20:     嘔   嘔嘔 嘔嘔嘔   912     3 1.159833e-03 1.179104e-03
#> 21:     快   生日 生日快   282     3 3.586326e-04 3.558946e-04
#> 22:     喔   喔喔 喔喔喔   204     3 2.594364e-04 2.758454e-04
#> 23:     黨   民進 民進黨   846     3 1.075898e-03 9.292202e-04
#> 24:     新   完整 完整新   424     3 5.392207e-04 4.889494e-04
#> 25:      t     km    kmt   248     3 3.153932e-04 2.769271e-04
#> 26:     得   我覺 我覺得   231     3 2.937735e-04 2.574557e-04
#> 27:     道   不知 不知道   433     3 5.506664e-04 5.722439e-04
#> 28:     哈   哈哈 哈哈哈   558     3 7.096348e-04 1.074174e-03
#> 29:      t     pt    ptt   271     3 3.446434e-04 3.191152e-04
#> 30:     黨   國民 國民黨   352     3 4.476549e-04 4.900312e-04
#> 31:     文   蔡英 蔡英文   283     3 3.599044e-04 3.115430e-04
#> 32:     家   岸一 岸一家   240     3 3.052192e-04 2.639461e-04
#> 33:     麼   為什 為什麼   347     3 4.412962e-04 3.753660e-04
#> 34:      p     dp    dpp   583     3 7.414284e-04 6.890725e-04
#> 35:     市   台北 台北市   352     3 4.476549e-04 8.567432e-04
#> 36:      d     dd    ddd  1900     3 2.416319e-03 2.624317e-03
#> 37:     價   台灣 台灣價   426     3 5.417642e-04 4.160397e-03
#> 38:      d     xd    xdd   491     3 6.244277e-04 1.086073e-03
#> 39:     有   有沒 有沒有   545     3 6.931020e-04 6.068598e-04
#> 40:     是   是不 是不是   387     3 4.921660e-04 8.178004e-04
#> 41:   動員     運 運動員   188     3 2.390884e-04 8.748784e-04
#> 42:   民國     華 華民國   198     3 2.518059e-04 8.823480e-04
#> 43:   岸一     兩 兩岸一   241     3 3.064910e-04 1.340795e-03
#> 44:   己的     自 自己的   271     3 3.446434e-04 3.391204e-03
#> 45:   來就     本 本來就   239     3 3.039475e-04 2.728276e-03
#> 46:   的很     真 真的很   291     3 3.700783e-04 2.992514e-03
#> 47:   灣人     台 台灣人   441     3 5.608404e-04 7.374375e-03
#>         w2     w1   word     N ngram            p           p1
#>               p2       pmi
#>            <num>     <num>
#>  1: 2.801105e-05 10.603904
#>  2: 2.894475e-05 10.502737
#>  3: 4.295027e-05 10.217282
#>  4: 4.388397e-05 10.149256
#>  5: 1.848729e-04  8.757656
#>  6: 3.025193e-04  8.248278
#>  7: 3.874862e-04  7.907239
#>  8: 4.239005e-04  7.627296
#>  9: 6.293149e-04  7.529263
#> 10: 7.301546e-04  7.377320
#> 11: 7.703038e-04  7.307857
#> 12: 6.554585e-04  7.239899
#> 13: 8.057845e-04  7.201895
#> 14: 8.748784e-04  7.199099
#> 15: 6.479889e-04  7.178616
#> 16: 9.757182e-04  7.094151
#> 17: 1.005597e-03  7.032898
#> 18: 1.082160e-03  6.985071
#> 19: 8.356629e-04  6.972508
#> 20: 1.198873e-03  6.709895
#> 21: 1.236221e-03  6.703360
#> 22: 1.202608e-03  6.661934
#> 23: 2.230613e-03  6.252044
#> 24: 2.192331e-03  6.220656
#> 25: 2.495784e-03  6.123218
#> 26: 2.535934e-03  6.109155
#> 27: 2.147514e-03  6.105008
#> 28: 1.505127e-03  6.084321
#> 29: 2.495784e-03  6.070110
#> 30: 2.230613e-03  6.015032
#> 31: 3.270757e-03  5.867035
#> 32: 3.419215e-03  5.823630
#> 33: 3.816972e-03  5.730112
#> 34: 3.601287e-03  5.699696
#> 35: 1.993453e-03  5.568771
#> 36: 4.940215e-03  5.227771
#> 37: 1.076558e-03  4.795451
#> 38: 4.940215e-03  4.756858
#> 39: 1.097940e-02  4.644614
#> 40: 1.733884e-02  3.547004
#> 41: 2.347390e-04  7.059785
#> 42: 2.509652e-04  7.036268
#> 43: 2.639461e-04  6.763935
#> 44: 2.996438e-04  5.826486
#> 45: 3.569763e-04  5.743271
#> 46: 4.045732e-04  5.722524
#> 47: 5.333010e-04  4.960094
#>               p2       pmi
#>         w2     w1     word     N ngram            p           p1
#>     <char> <char>   <char> <int> <int>        <num>        <num>
#>  1:     妙 莫名其 莫名其妙    24     4 3.626643e-05 3.179367e-05
#>  2:     閱 先仔細 先仔細閱    25     4 3.777753e-05 3.179367e-05
#>  3:     稿 社論特 社論特稿    32     4 4.835524e-05 4.069590e-05
#>  4:     壇 雙城論 雙城論壇    22     4 3.324423e-05 2.797843e-05
#>  5:     勁 越不對 越不對勁    34     4 5.137744e-05 5.850036e-05
#>  6:     詢 檢舉板 檢舉板詢    26     4 3.928863e-05 3.306542e-05
#>  7:     畜 支那賤 支那賤畜    39     4 5.893295e-05 5.341337e-05
#>  8:     構 精細結 精細結構    42     4 6.346625e-05 5.595686e-05
#>  9:     鬧 重者以 重者以鬧    25     4 3.777753e-05 3.179367e-05
#> 10:     禁 板嚴格 板嚴格禁    25     4 3.777753e-05 3.179367e-05
#> 11:     細 請先仔 請先仔細    25     4 3.777753e-05 3.179367e-05
#> 12:     仔 前請先 前請先仔    25     4 3.777753e-05 3.433717e-05
#> 13:     替 政黨輪 政黨輪替    29     4 4.382194e-05 3.688066e-05
#> 14:     址 供短網 供短網址    32     4 4.835524e-05 4.069590e-05
#> 15:     址 或短網 或短網址   140     4 2.115542e-04 1.780446e-04
#> 16:     止 嚴格禁 嚴格禁止    25     4 3.777753e-05 3.179367e-05
#> 17:     嗚 嗚嗚嗚 嗚嗚嗚嗚    23     4 3.475533e-05 5.086987e-05
#> 18:     狀 維持現 維持現狀    34     4 5.137744e-05 4.323939e-05
#> 19:     訪 電台專 電台專訪    25     4 3.777753e-05 3.179367e-05
#> 20:     桶 將被水 將被水桶    25     4 3.777753e-05 3.179367e-05
#> 21:     桶 文字水 文字水桶    25     4 3.777753e-05 3.179367e-05
#> 22:     桶 治類水 治類水桶    32     4 4.835524e-05 4.069590e-05
#> 23:     桶 貼者水 貼者水桶    43     4 6.497736e-05 5.468512e-05
#> 24:     章 充實文 充實文章    25     4 3.777753e-05 3.179367e-05
#> 25:     章 刪除文 刪除文章    32     4 4.835524e-05 4.069590e-05
#> 26:     濟 共享經 共享經濟    26     4 3.928863e-05 3.433717e-05
#> 27:     嚴 本看板 本看板嚴    25     4 3.777753e-05 3.179367e-05
#> 28:     驗 智力測 智力測驗    25     4 3.777753e-05 3.179367e-05
#> 29:     額 刪也算 刪也算額    43     4 6.497736e-05 8.647879e-05
#> 30:     擊 網友點 網友點擊    32     4 4.835524e-05 4.196765e-05
#> 31:     注 問卦請 問卦請注    25     4 3.777753e-05 3.179367e-05
#> 32:     讀 仔細閱 仔細閱讀    25     4 3.777753e-05 3.179367e-05
#> 33:     欠 互不相 互不相欠    21     4 3.173313e-05 2.925018e-05
#> 34:     醒 看完提 看完提醒    25     4 3.777753e-05 3.179367e-05
#> 35:     刪 刪及被 刪及被刪    25     4 3.777753e-05 3.179367e-05
#> 36:     刪 刪或自 刪或自刪    43     4 6.497736e-05 5.468512e-05
#> 37:     刪 提醒請 提醒請刪    25     4 3.777753e-05 3.179367e-05
#> 38:     刪 照板規 照板規刪    32     4 4.835524e-05 4.069590e-05
#> 39:     態 意識形 意識形態    21     4 3.173313e-05 2.670668e-05
#> 40:     短 需提供 需提供短    32     4 4.835524e-05 4.069590e-05
#> 41:     寫 題沒有 題沒有寫    32     4 4.835524e-05 4.069590e-05
#> 42:     篇 也算兩 也算兩篇    25     4 3.777753e-05 3.179367e-05
#> 43:     規 依照板 依照板規    32     4 4.835524e-05 4.069590e-05
#> 44:     規 相關板 相關板規    25     4 3.777753e-05 3.179367e-05
#> 45:     廠 深澳電 深澳電廠    23     4 3.475533e-05 3.052192e-05
#> 46:     群 客原族 客原族群    21     4 3.173313e-05 2.670668e-05
#> 47:     刪 也會被 也會被刪    32     4 4.835524e-05 5.214162e-05
#> 48:     格 看板嚴 看板嚴格    25     4 3.777753e-05 3.179367e-05
#> 49:     源 媒體來 媒體來源   145     4 2.191097e-04 1.869468e-04
#> 50:     則 張貼一 張貼一則    43     4 6.497736e-05 5.468512e-05
#> 51:     亞 馬來西 馬來西亞    83     4 1.254214e-04 1.080985e-04
#> 52:     樂 生日快 生日快樂   282     4 4.261306e-04 3.586326e-04
#> 53:     親 岸一家 岸一家親   221     4 3.339534e-04 3.052192e-04
#> 54:     值 台灣價 台灣價值   423     4 6.391959e-04 5.417642e-04
#> 55:     聞 完整新 完整新聞   424     4 6.407070e-04 5.392207e-04
#> 56:     嘔 嘔嘔嘔 嘔嘔嘔嘔   740     4 1.118215e-03 1.159833e-03
#> 57:     哈 哈哈哈 哈哈哈哈   312     4 4.714636e-04 7.096348e-04
#> 58:      d    ddd     dddd  1531     4 2.313496e-03 2.416319e-03
#> 59:      d    xdd     xddd   369     4 5.575964e-04 6.244277e-04
#> 60: 讀相關     閱 閱讀相關    25     4 3.777753e-05 7.656353e-05
#> 61: 台支那     滯 滯台支那    33     4 4.986634e-05 7.843093e-05
#> 62: 體中文     繁 繁體中文    48     4 7.253286e-05 8.216574e-05
#> 63: 合報導     綜 綜合報導    70     4 1.057771e-04 1.073757e-04
#> 64: 都不能     稿 稿都不能    32     4 4.835524e-05 7.843093e-05
#> 65: 騙集團     詐 詐騙集團    40     4 6.044405e-05 1.260497e-04
#> 66: 客原族     閩 閩客原族    21     4 3.173313e-05 1.279171e-04
#> 67: 存在感     刷 刷存在感    34     4 5.137744e-05 1.120442e-04
#> 68: 止政治     禁 禁止政治    25     4 3.777753e-05 1.456574e-04
#> 69: 閱讀相     細 細閱讀相    25     4 3.777753e-05 1.465911e-04
#> 70: 一支持     唯 唯一支持    53     4 8.008837e-05 1.493923e-04
#> 71: 結構常     細 細結構常    34     4 5.137744e-05 1.465911e-04
#> 72: 果日報     蘋 蘋果日報    61     4 9.217718e-05 1.605967e-04
#> 73: 方便網     址 址方便網    32     4 4.835524e-05 1.848729e-04
#> 74: 政治問     止 止政治問    25     4 3.777753e-05 1.914088e-04
#> 75: 告也會     廣 廣告也會    32     4 4.835524e-05 2.250221e-04
#> 76: 重者以     嚴 嚴重者以    25     4 3.777753e-05 2.324917e-04
#> 77: 有專板     否 否有專板    25     4 3.777753e-05 2.427624e-04
#> 78: 相關板     讀 讀相關板    25     4 3.777753e-05 2.605027e-04
#> 79: 請刪除     醒 醒請刪除    25     4 3.777753e-05 2.810442e-04
#> 80: 也算兩     刪 刪也算兩    25     4 3.777753e-05 2.857127e-04
#> 81: 如蘋果     例 例如蘋果    34     4 5.137744e-05 2.857127e-04
#> 82:   除ct     刪   刪除ct    25     4 3.777753e-05 2.857127e-04
#> 83: 網址方     短 短網址方    32     4 4.835524e-05 2.997182e-04
#> 84: 摩新聞     奇 奇摩新聞    37     4 5.591075e-05 3.230607e-04
#> 85: 曲棍球     吞 吞曲棍球    24     4 3.626643e-05 1.185801e-04
#> 86: 刪除文     規 規刪除文    32     4 4.835524e-05 3.380000e-04
#> 87: 者刪除     違 違者刪除    32     4 4.835524e-05 3.650773e-04
#> 88: 禁止政     格 格禁止政    25     4 3.777753e-05 3.678784e-04
#> 89:    apa      j     japa    63     4 9.519938e-05 3.641436e-04
#> 90: 岸一家     兩 兩岸一家   238     4 3.596421e-04 1.340795e-03
#>         w2     w1     word     N ngram            p           p1
#>               p2      pmi
#>            <num>    <num>
#>  1: 7.096132e-05 9.685001
#>  2: 7.656353e-05 9.649837
#>  3: 7.843093e-05 9.625739
#>  4: 9.056905e-05 9.481845
#>  5: 8.403314e-05 9.254466
#>  6: 1.139116e-04 9.252535
#>  7: 1.176464e-04 9.146166
#>  8: 1.269834e-04 9.097381
#>  9: 1.363204e-04 9.072950
#> 10: 1.456574e-04 9.006700
#> 11: 1.465911e-04 9.000310
#> 12: 1.493923e-04 8.904421
#> 13: 1.690000e-04 8.858059
#> 14: 1.848729e-04 8.768289
#> 15: 1.848729e-04 8.768289
#> 16: 1.914088e-04 8.733546
#> 17: 1.101768e-04 8.732486
#> 18: 2.184862e-04 8.601235
#> 19: 2.231547e-04 8.580093
#> 20: 2.240884e-04 8.575917
#> 21: 2.240884e-04 8.575917
#> 22: 2.240884e-04 8.575917
#> 23: 2.240884e-04 8.575917
#> 24: 2.250221e-04 8.571759
#> 25: 2.250221e-04 8.571759
#> 26: 2.231547e-04 8.542352
#> 27: 2.324917e-04 8.539103
#> 28: 2.427624e-04 8.495875
#> 29: 1.540608e-04 8.492303
#> 30: 2.483646e-04 8.442288
#> 31: 2.605027e-04 8.425344
#> 32: 2.605027e-04 8.425344
#> 33: 2.530331e-04 8.363466
#> 34: 2.810442e-04 8.349446
#> 35: 2.857127e-04 8.332971
#> 36: 2.857127e-04 8.332971
#> 37: 2.857127e-04 8.332971
#> 38: 2.857127e-04 8.332971
#> 39: 2.885138e-04 8.323215
#> 40: 2.997182e-04 8.285115
#> 41: 3.258619e-04 8.201484
#> 42: 3.323978e-04 8.181626
#> 43: 3.380000e-04 8.164912
#> 44: 3.380000e-04 8.164912
#> 45: 3.351989e-04 8.130674
#> 46: 3.520055e-04 8.124311
#> 47: 2.857127e-04 8.085135
#> 48: 3.678784e-04 8.080205
#> 49: 3.678784e-04 8.066506
#> 50: 3.734806e-04 8.065092
#> 51: 3.837514e-04 8.014152
#> 52: 6.293149e-04 7.543326
#> 53: 6.554585e-04 7.420147
#> 54: 7.301546e-04 7.387634
#> 55: 9.757182e-04 7.104784
#> 56: 1.198873e-03 6.689831
#> 57: 1.505127e-03 6.089970
#> 58: 4.940215e-03 5.266861
#> 59: 4.940215e-03 5.197146
#> 60: 3.179367e-05 9.649837
#> 61: 4.196765e-05 9.625739
#> 62: 6.613084e-05 9.499177
#> 63: 8.902228e-05 9.311624
#> 64: 5.977210e-05 9.241328
#> 65: 5.086987e-05 9.151281
#> 66: 2.670668e-05 9.136575
#> 67: 5.214162e-05 9.081853
#> 68: 3.179367e-05 9.006700
#> 69: 3.179367e-05 9.000310
#> 70: 6.740258e-05 8.981382
#> 71: 4.451114e-05 8.971323
#> 72: 7.757656e-05 8.909062
#> 73: 4.069590e-05 8.768289
#> 74: 3.433717e-05 8.656585
#> 75: 4.069590e-05 8.571759
#> 76: 3.179367e-05 8.539103
#> 77: 3.306542e-05 8.456654
#> 78: 3.179367e-05 8.425344
#> 79: 3.179367e-05 8.349446
#> 80: 3.179367e-05 8.332971
#> 81: 4.323939e-05 8.332971
#> 82: 3.179367e-05 8.332971
#> 83: 4.069590e-05 8.285115
#> 84: 4.705463e-05 8.210117
#> 85: 8.393529e-05 8.200768
#> 86: 4.069590e-05 8.164912
#> 87: 4.069590e-05 8.087849
#> 88: 3.179367e-05 8.080205
#> 89: 8.393529e-05 8.043889
#> 90: 3.052192e-04 6.778571
#>               p2      pmi

Add new terms into user dictionary

bi_wc_final[, .(word, N)] %>%
  write.table("dict/user_dict_utf8.txt", quote = FALSE,
              append = FALSE,
              sep = " ",
              row.names = FALSE, col.names = FALSE)
tri_wc_final[, .(word, N)] %>%
  write.table("dict/user_dict_utf8.txt", quote = FALSE,
              append = TRUE,
              sep = " ",
              row.names = FALSE, col.names = FALSE)
four_wc_final[, .(word, N)] %>%
  write.table("dict/user_dict_utf8.txt", quote = FALSE,
              append = TRUE,
              sep = " ",
              row.names = FALSE, col.names = FALSE)

mix_seg <- worker(type = "mix",
                  dict = "dict/jieba_dict_utf8.txt",
                  stop_word = "dict/stop_utf8.txt",
                  user = "dict/user_dict_utf8.txt",
                  symbol = FALSE,
                  encoding = "UTF-8")

# tokenize
text_seg <- dt[, paste(post_text, push_text)] %>% lapply(cutter, mix_seg)
segment("中華電信499吃到飽之亂引爆戰火", mix_seg)
#> [1] "中華"   "電信"   "499"    "吃到飽" "之亂"   "引爆"   "戰火"

Word Embedding

Word Count

# text2vec
text_token <- itoken(text_seg)
vocab <- create_vocabulary(
  text_token, 
  ngram=c(1L, 2L),
  sep_ngram = "_"
)

pruned_vocab <- prune_vocabulary(
  vocab, 
  term_count_min = 10, 
  doc_proportion_min = 0.001, 
  doc_proportion_max = 0.9,
  vocab_term_max = 20000
)
# class(pruned_vocab) <- c("text2vec_vocabulary", "data.table", "data.frame")
pruned_vocab <- pruned_vocab[str_length(pruned_vocab$term) >= 2,] # remove 1-word term

# Make DTM
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm <- create_dtm(text_token, vectorizer)

# Check most freq terms
Matrix::colSums(dtm) %>% sort(decreasing = T) %>% head(20)
#>   台灣   中國   一個   都是   五樓   的人 民進黨   應該   垃圾   美國 
#>   3008   1604   1331    963    962    865    846    842    811    808 
#> 柯文哲   根本   支持   也是   日本   台北   市長   不用   柯粉   總統 
#>    741    713    711    709    686    604    603    533    532    527

word2vec

wordVectors

# Prepare tokenizes text file
tokenize_text_lines <- text_seg %>%
  sapply(paste, collapse = " ")  # Tokens are split on spaces.
tokenize_text_lines %>% writeLines("dataset/tokenize_text_lines.txt")

# Fit models
vector_set <- train_word2vec(train_file = "dataset/tokenize_text_lines.txt",
                          output_file = "dataset/ptt_gossiping_word2vec.bin",
                          force = TRUE,
                          vectors = 200,
                          threads = parallel::detectCores()-1,
                          window = 6)

vector_set <- vector_set[-1,]

相近關聯詞

nearest_to(vector_set, vector_set[["柯文哲"]], n = 20)
#>       柯文哲         柯文         哲為         但柯         輸誠 
#> 8.881784e-16 2.163745e-01 2.461457e-01 2.645223e-01 2.653944e-01 
#>         表態         柯說         墨綠         綠營     脫口而出 
#> 2.726296e-01 2.801413e-01 2.809439e-01 2.880358e-01 2.892135e-01 
#>         直言         民進         氣勢         勇弟       陳景峻 
#> 2.896340e-01 2.899235e-01 2.918302e-01 2.939741e-01 2.951965e-01 
#>         修補       選對會         專訪         政見         致歉 
#> 3.020913e-01 3.021007e-01 3.068234e-01 3.087859e-01 3.103285e-01
nearest_to(vector_set, vector_set[["499"]], n = 20)
#>          499       吃到飽          399          183          299 
#> 7.771561e-16 1.576027e-01 1.827954e-01 1.962149e-01 1.985694e-01 
#>         資費         不限           NP         限時          699 
#> 2.024386e-01 2.029390e-01 2.070329e-01 2.101635e-01 2.125551e-01 
#>         限速         剛出         退傭         方案         之亂 
#> 2.150124e-01 2.222288e-01 2.267977e-01 2.270077e-01 2.274960e-01 
#>         搶辦         月繳         種花          599     中華電信 
#> 2.278977e-01 2.295549e-01 2.307604e-01 2.310141e-01 2.311899e-01
nearest_to(vector_set, vector_set[["台灣價值"]], n = 20)
#>   台灣     說     會     沒   中國   一個     喔     柯     想     推 
#>    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN 
#>     幹   新聞   五樓 民進黨   應該     年   垃圾     做   美國   支持 
#>    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN

Visualisation: Clustering

Dimension reduction with t-SNE

# tsne_vec <- tsne(vector_set, k = 2, epoch = 10)
fit_pca <- prcomp(vector_set, scale = FALSE)
fviz_eig(fit_pca)


dt_pca <- fit_pca$x %>% as.data.table(keep.rownames = FALSE)
dt_pca[, word := rownames(fit_pca$x)]
db <- dbscan(dt_pca[, .(PC1, PC2, PC3, PC4, PC5)], eps = .4, minPts = 4)

dt_pca_2 <- dt_pca[, .(word, PC1, PC2, cluster = db$cluster)]

Plotly

p <- plot_ly(
  data = dt_pca_2, x = ~PC1, y = ~PC2, 
  # color = ~cluster,
  # Hover text:
  text = ~paste(word)
)
p
#> No trace type specified:
#>   Based on info supplied, a 'scatter' trace seems appropriate.
#>   Read more about this trace type -> https://plot.ly/r/reference/#scatter
#> No scatter mode specifed:
#>   Setting the mode to markers
#>   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Keyword Extraction

# 利用 tf-idf 關鍵詞算法,處理高頻詞高估及低頻詞低估的問題,取得整個文檔的關鍵詞

# tf-idf
tfidf = TfIdf$new() # define tfidf model
# fit model to train data and transform train data with fitted model
dtm_train_tfidf = fit_transform(dtm, tfidf)
# tfidf modified by fit_transform() call!

# Key term
key_term <- dtm_train_tfidf %>% 
  find_freq_terms(lowfreq = 0.05) %>% 
  Matrix::colSums(.) %>% 
  data.frame() %>% 
  data.table(keep.rownames = TRUE) %>% 
  setnames(c("keyword", "sum_tf_idf")) %>% 
  .[order(-sum_tf_idf)]
# key_term %>% head(100) %>% DT::datatable(extensions = "Responsive")

# Wordcloud
d <- key_term %>% head(200)
ncolor <- nrow(d)
getPalette = colorRampPalette(RColorBrewer::brewer.pal(8, "Set2"))
wordcloud2(d,
           size = 0.5,
           fontFamily = "Noto Sans CJK TC", 
           fontWeight = "normal",
           rotateRatio = 0,
           color = getPalette(ncolor),
shape = "circle")

References