2015年11月17日 星期二

Automatic recursive indexing of Chinese articles for later querying

  1.  
  2. /*
  3. Indexer.java
  4. 本程式利用 mmseg4j.jar 套件作中文斷詞,可從給定目錄自動對所有以下各層文字檔案編製索引。
  5. 索引表以物件串流存到硬碟invert.dat檔案,下回可以自動復原,方便檢索某詞出現在哪些檔案。
  6.  
  7. > javac -cp mmseg4j.jar;. Indexer.java
  8. > java -cp mmseg4j.jar;. Indexer \data
  9.  
  10. init: path=\data
  11. chars loaded time=110ms, line=13060, on file=\data\chars.dic
  12. words loaded time=125ms, line=137450, on file=!/data/words.dic
  13. unit loaded time=0ms, line=22, on file=file:\mmseg4j.jar!\data\units.dic
  14. \data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]
  15.  
  16. \data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]
  17.  
  18. \data\L1\F2.txt: [,:88,的:74,、:31, :25,。:25,在:16,海底:16,尋:11,寶:11,﹁:10,沈船:10]
  19.  
  20. \data\L1\F3.txt: [,:29,的:14,沈船:9,打撈:8,澎湖:6, :5,。:5,工作:5,進行:5,館:5,後:5,古:5]
  21.  
  22. \data\L1\F4.txt: [,:21,。:13,的:11,船:7,、:6,去年:6,工作:6,澎湖:6,探勘:6,初:5,進行:5,沉:5,包括:5,勘:5,博:5,將軍:5,史:5]
  23.  
  24. \data\L1\L2\E1.txt: [,:51,的:16,與:9,。:7,主:6,老街:6, :5,做:5,拆:5,三峽:4]
  25.  
  26. \data\L1\L2\E2.txt: [,:49,的:26,三峽:11,老街:10,。:8,與:7,古蹟:7,、:6,文化:6,而:5,祖師廟:5,保留:5]
  27.  
  28. \data\L1\L2\E3.txt: [,:36,的:14,。:13,三峽:13,「:7,」:7,主:7,老街:7,協調會:5,發展:5]
  29.  
  30. \data\L1\L2\E4.txt: [,:53,的:19,。:8,三峽:6,主:6, :5,不:5,拆除:5,在:5,而:4,老街:4,財產:4,住戶:4,改建:4,古蹟:4,保留:4,排除:4,派:4,介入:4]
  31.  
  32. \data\L1\L2\E5.txt: [,:30, :18,。:10,三峽:10,老街:7,文建會:7,立:7,的:5,派:5,面:5,騎樓:5]
  33.  
  34. \data\L1\L2\E6.txt: [,:52,的:17,。:9,民眾:8,老街:7,三峽:6,拆:6, :5,而:5,文建會:5]
  35.  
  36. \data\L1\L2\E7.txt: [,:27,老街:12,。:7,屋:6,街:6, :5,的:5,、:4,與:4,三峽:4,是:4,住戶:4,古蹟:4]
  37.  
  38. \data\L1\L2\L3\D1.txt: [,:47,「:35,」:35,的:29,、:20,布袋戲:15,。:14,宛然:13,祿:10,天:9,李:9]
  39.  
  40. \data\L1\L2\L3\D2.txt: [,:23,「:17,」:17,的:14,、:12,。:8,壇:8,藝術:7,儀式:5, :4,主:4,-:4,露天劇場:4,開荒:4,啟用:4]
  41.  
  42. \data\L1\L2\L3\D3.txt: [,:52,畫:20,的:18,作:17,館:14,。:10,「:10,」:10,資料:10,這些:10]
  43.  
  44. \data\L1\L2\L3\D4.txt: [,:28,。:12,她:11,、:6,「:6,」:6,貝:6,文:6,王:6,音樂:5,的:5]
  45.  
  46. \data\L1\L2\M3\C1.txt: [,:27,的:22,」:18,「:17,。:11,中:8,柴可夫斯基:7,盛:7,能:7,余:7]
  47.  
  48. \data\L1\L2\M3\C2.txt: [,:38,的:27,舞:20,。:19,「:18,」:17,德國:11,能:11,團:11,、:10,舞蹈:10]
  49.  
  50. save and load 'invert.dat'
  51.  
  52. Input the query word:
  53. 三峽
  54. query '三峽' occurs in articles [\data\L1\L2\E1.txt, \data\L1\L2\E2.txt, \data\L1\L2\E3.txt,\data\L1\L2\E4.txt, \data\L1\L2\E5.txt, \data\L1\L2\E6.txt, \data\L1\L2\E7.txt]
  55.  
  56. */
  57. import java.io.File;
  58. import java.io.FileInputStream;
  59. import java.io.FileOutputStream;
  60. import java.io.InputStreamReader;
  61. import java.io.BufferedReader;
  62. import java.io.ObjectInputStream;
  63. import java.io.ObjectOutputStream;
  64. import java.io.FileNotFoundException;
  65. import java.io.UnsupportedEncodingException;
  66. import java.io.IOException;
  67.  
  68. import java.util.Set;
  69. import java.util.Map;
  70. import java.util.HashMap;
  71. import java.util.ArrayList;
  72. import java.util.List;
  73. import java.util.Collection;
  74. import java.util.Collections;
  75. import java.util.Comparator;
  76. import java.util.Scanner;
  77.  
  78. import com.chenlb.mmseg4j.example.Simple;
  79.  
  80. public class Indexer
  81. {
  82. private static Simple segmenter;
  83. private static HashMap<String, ArrayList<String>> invertList;
  84.  
  85.  
  86. public static void statArticle(File file, HashMap<String, Integer> frequency)
  87. throws FileNotFoundException,UnsupportedEncodingException,IOException
  88. {
  89. FileInputStream fis = new FileInputStream(file); // FileNotFoundException
  90. InputStreamReader isr = new InputStreamReader(fis, "big5"); //
  91. BufferedReader br = new BufferedReader(isr);
  92.  
  93. // fill hashmap frequency
  94.  
  95. while(br.ready()==true) // IOException
  96. {
  97. String text = br.readLine();
  98. text = text.replaceAll(" ", ""); // remove interference from blanks
  99. String seg_text = segmenter.segWords(text, " ");
  100. //System.out.println(text);
  101. //System.out.println(seg_text);
  102. String words[] = seg_text.split(" ");
  103.  
  104. for(String w : words)
  105. {
  106. //System.out.println(w.length() + ":" + w);
  107. if(w.length()==0) continue;
  108.  
  109. if(frequency.containsKey(w)==true)
  110. {
  111. int count = frequency.get(w);
  112. count++;
  113. frequency.put(w,count);
  114. }
  115. else
  116. {
  117. frequency.put(w,1);
  118. }
  119. }
  120. }
  121. br.close();
  122.  
  123. // process hashmap frequency
  124.  
  125. int numberWords = frequency.size();
  126. /*
  127. Collection<Integer> counts = frequency.values();
  128. List<Integer> counts_list = new ArrayList<>(counts);
  129. Collections.sort(counts_list);
  130.  
  131. int count_threshold = counts_list.get(counts_list.size() - 10);
  132.  
  133. String fullName = file.getCanonicalPath();
  134. System.out.printf("%s: [", fullName);
  135. for(String word : frequency.keySet())
  136. {
  137. int count = frequency.get(word);
  138.  
  139. if(count >= count_threshold)
  140. {
  141. System.out.printf(",%s:%d", word, count);
  142. }
  143. }
  144. System.out.printf("]\n\n");
  145. */
  146. Set<Map.Entry<String,Integer>> entries = frequency.entrySet();
  147. List<Map.Entry<String,Integer>> entryList = new ArrayList<Map.Entry<String,Integer>>(entries);
  148. Comparator<Map.Entry<String,Integer>> cmp = new Comparator<Map.Entry<String,Integer>>() {
  149. public int compare(Map.Entry<String,Integer> e1, Map.Entry<String,Integer> e2)
  150. {
  151. return e2.getValue() - e1.getValue(); // descending order
  152. }
  153. };
  154. Collections.sort(entryList, cmp);
  155.  
  156. int count_threshold = entryList.get(9).getValue();
  157.  
  158. String fullName = file.getCanonicalPath();
  159. System.out.printf("%s: [", fullName);
  160. boolean first=true;
  161. for(Map.Entry<String,Integer> e : entryList)
  162. {
  163. if(e.getValue() >= count_threshold)
  164. {
  165. System.out.printf("%s%s:%d", (first==false)?",":"", e.getKey(), e.getValue());
  166. if(first==true) first = false;
  167. }
  168. }
  169. System.out.printf("]\n\n");
  170. }
  171.  
  172.  
  173. public static void indexArticleWords(File dirFile,
  174. HashMap<String, ArrayList<String>> invertList)
  175. throws FileNotFoundException,UnsupportedEncodingException,IOException
  176. {
  177.  
  178. File list[] = dirFile.listFiles();
  179. String fullName;
  180.  
  181. for(File f : list)
  182. {
  183. fullName = f.getCanonicalPath(); // throws IOException
  184.  
  185. if(f.isFile()==true)
  186. {
  187. HashMap<String, Integer> frequency
  188. = new HashMap<String, Integer>();
  189.  
  190. statArticle(f, frequency);
  191.  
  192. for(String word : frequency.keySet())
  193. {
  194. if(invertList.containsKey(word)==true)
  195. {
  196. ArrayList<String> oldList = invertList.get(word);
  197. oldList.add(fullName);
  198. }
  199. else
  200. {
  201. ArrayList<String> newList = new ArrayList<>();
  202. newList.add(fullName);
  203. invertList.put(word, newList);
  204. }
  205. }
  206. }
  207.  
  208. else if(f.isDirectory()==true)
  209. {
  210. indexArticleWords(f, invertList);
  211. }
  212. }
  213. }
  214.  
  215.  
  216.  
  217. @SuppressWarnings("unchecked")
  218. public static void main(String args[])
  219. throws FileNotFoundException,UnsupportedEncodingException,
  220. IOException,ClassNotFoundException
  221. {
  222. // set up root folder
  223. String rootName = "data/";
  224. String testName = "data/L1/F1.txt";
  225. File root;
  226.  
  227. if(args.length==1)
  228. {
  229. rootName = args[0];
  230. }
  231.  
  232. root = new File(rootName);
  233.  
  234. // set up segmenter and hashmap frequency
  235. segmenter = new Simple();
  236.  
  237. HashMap<String, Integer> frequency = new HashMap<>();
  238.  
  239. statArticle(new File(testName), frequency); // 測試文章斷詞
  240.  
  241. if(new File("invert.dat").exists()==false)
  242. {
  243. // set up invertList
  244. invertList = new HashMap<String, ArrayList<String>>();
  245.  
  246. indexArticleWords(root, invertList);
  247.  
  248. //save invertList to file 'invert.dat' and load from file 'invert.dat'
  249.  
  250. ObjectOutputStream oos = new ObjectOutputStream(
  251. new FileOutputStream("invert.dat"));
  252. oos.writeObject(invertList);
  253. oos.close();
  254. }
  255.  
  256.  
  257. ObjectInputStream ois = new ObjectInputStream(
  258. new FileInputStream("invert.dat"));
  259. HashMap<String, ArrayList<String>>
  260. invertList2 = (HashMap<String, ArrayList<String>>) ois.readObject();
  261. ois.close();
  262.  
  263. // do the query test on the loaded invertList
  264. System.out.println("save and load 'invert.dat'\n");
  265.  
  266. System.out.print("Input the query word: ");
  267. Scanner sc = new Scanner(System.in);
  268. String query = sc.next();
  269.  
  270. //print the file paths which have the query word
  271. ArrayList<String> list = invertList2.get(query);
  272. System.out.printf("query '%s' occurs in articles %s\n", query, list);
  273.  
  274. }
  275. }

沒有留言: