-
- /*
- Indexer.java
- 本程式利用 mmseg4j.jar 套件作中文斷詞,可從給定目錄自動對所有以下各層文字檔案編製索引。
- 索引表以物件串流存到硬碟invert.dat檔案,下回可以自動復原,方便檢索某詞出現在哪些檔案。
-
- > javac -cp mmseg4j.jar;. Indexer.java
- > java -cp mmseg4j.jar;. Indexer \data
-
- init: path=\data
- chars loaded time=110ms, line=13060, on file=\data\chars.dic
- words loaded time=125ms, line=137450, on file=!/data/words.dic
- unit loaded time=0ms, line=22, on file=file:\mmseg4j.jar!\data\units.dic
- \data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]
-
- \data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]
-
- \data\L1\F2.txt: [,:88,的:74,、:31, :25,。:25,在:16,海底:16,尋:11,寶:11,﹁:10,沈船:10]
-
- \data\L1\F3.txt: [,:29,的:14,沈船:9,打撈:8,澎湖:6, :5,。:5,工作:5,進行:5,館:5,後:5,古:5]
-
- \data\L1\F4.txt: [,:21,。:13,的:11,船:7,、:6,去年:6,工作:6,澎湖:6,探勘:6,初:5,進行:5,沉:5,包括:5,勘:5,博:5,將軍:5,史:5]
-
- \data\L1\L2\E1.txt: [,:51,的:16,與:9,。:7,主:6,老街:6, :5,做:5,拆:5,三峽:4]
-
- \data\L1\L2\E2.txt: [,:49,的:26,三峽:11,老街:10,。:8,與:7,古蹟:7,、:6,文化:6,而:5,祖師廟:5,保留:5]
-
- \data\L1\L2\E3.txt: [,:36,的:14,。:13,三峽:13,「:7,」:7,主:7,老街:7,協調會:5,發展:5]
-
- \data\L1\L2\E4.txt: [,:53,的:19,。:8,三峽:6,主:6, :5,不:5,拆除:5,在:5,而:4,老街:4,財產:4,住戶:4,改建:4,古蹟:4,保留:4,排除:4,派:4,介入:4]
-
- \data\L1\L2\E5.txt: [,:30, :18,。:10,三峽:10,老街:7,文建會:7,立:7,的:5,派:5,面:5,騎樓:5]
-
- \data\L1\L2\E6.txt: [,:52,的:17,。:9,民眾:8,老街:7,三峽:6,拆:6, :5,而:5,文建會:5]
-
- \data\L1\L2\E7.txt: [,:27,老街:12,。:7,屋:6,街:6, :5,的:5,、:4,與:4,三峽:4,是:4,住戶:4,古蹟:4]
-
- \data\L1\L2\L3\D1.txt: [,:47,「:35,」:35,的:29,、:20,布袋戲:15,。:14,宛然:13,祿:10,天:9,李:9]
-
- \data\L1\L2\L3\D2.txt: [,:23,「:17,」:17,的:14,、:12,。:8,壇:8,藝術:7,儀式:5, :4,主:4,-:4,露天劇場:4,開荒:4,啟用:4]
-
- \data\L1\L2\L3\D3.txt: [,:52,畫:20,的:18,作:17,館:14,。:10,「:10,」:10,資料:10,這些:10]
-
- \data\L1\L2\L3\D4.txt: [,:28,。:12,她:11,、:6,「:6,」:6,貝:6,文:6,王:6,音樂:5,的:5]
-
- \data\L1\L2\M3\C1.txt: [,:27,的:22,」:18,「:17,。:11,中:8,柴可夫斯基:7,盛:7,能:7,余:7]
-
- \data\L1\L2\M3\C2.txt: [,:38,的:27,舞:20,。:19,「:18,」:17,德國:11,能:11,團:11,、:10,舞蹈:10]
-
- save and load 'invert.dat'
-
- Input the query word:
- 三峽
- query '三峽' occurs in articles [\data\L1\L2\E1.txt, \data\L1\L2\E2.txt, \data\L1\L2\E3.txt,\data\L1\L2\E4.txt, \data\L1\L2\E5.txt, \data\L1\L2\E6.txt, \data\L1\L2\E7.txt]
-
- */
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.InputStreamReader;
- import java.io.BufferedReader;
- import java.io.ObjectInputStream;
- import java.io.ObjectOutputStream;
- import java.io.FileNotFoundException;
- import java.io.UnsupportedEncodingException;
- import java.io.IOException;
-
- import java.util.Set;
- import java.util.Map;
- import java.util.HashMap;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Collection;
- import java.util.Collections;
- import java.util.Comparator;
- import java.util.Scanner;
-
- import com.chenlb.mmseg4j.example.Simple;
-
- public class Indexer
- {
- private static Simple segmenter;
- private static HashMap<String, ArrayList<String>> invertList;
-
-
- public static void statArticle(File file, HashMap<String, Integer> frequency)
- throws FileNotFoundException,UnsupportedEncodingException,IOException
- {
- FileInputStream fis = new FileInputStream(file); // FileNotFoundException
- InputStreamReader isr = new InputStreamReader(fis, "big5"); //
- BufferedReader br = new BufferedReader(isr);
-
- // fill hashmap frequency
-
- while(br.ready()==true) // IOException
- {
- String text = br.readLine();
- text = text.replaceAll(" ", ""); // remove interference from blanks
- String seg_text = segmenter.segWords(text, " ");
- //System.out.println(text);
- //System.out.println(seg_text);
- String words[] = seg_text.split(" ");
-
- for(String w : words)
- {
- //System.out.println(w.length() + ":" + w);
- if(w.length()==0) continue;
-
- if(frequency.containsKey(w)==true)
- {
- int count = frequency.get(w);
- count++;
- frequency.put(w,count);
- }
- else
- {
- frequency.put(w,1);
- }
- }
- }
- br.close();
-
- // process hashmap frequency
-
- int numberWords = frequency.size();
- /*
- Collection<Integer> counts = frequency.values();
- List<Integer> counts_list = new ArrayList<>(counts);
- Collections.sort(counts_list);
-
- int count_threshold = counts_list.get(counts_list.size() - 10);
-
- String fullName = file.getCanonicalPath();
- System.out.printf("%s: [", fullName);
- for(String word : frequency.keySet())
- {
- int count = frequency.get(word);
-
- if(count >= count_threshold)
- {
- System.out.printf(",%s:%d", word, count);
- }
- }
- System.out.printf("]\n\n");
- */
- Set<Map.Entry<String,Integer>> entries = frequency.entrySet();
- List<Map.Entry<String,Integer>> entryList = new ArrayList<Map.Entry<String,Integer>>(entries);
- Comparator<Map.Entry<String,Integer>> cmp = new Comparator<Map.Entry<String,Integer>>() {
- public int compare(Map.Entry<String,Integer> e1, Map.Entry<String,Integer> e2)
- {
- return e2.getValue() - e1.getValue(); // descending order
- }
- };
- Collections.sort(entryList, cmp);
-
- int count_threshold = entryList.get(9).getValue();
-
- String fullName = file.getCanonicalPath();
- System.out.printf("%s: [", fullName);
- boolean first=true;
- for(Map.Entry<String,Integer> e : entryList)
- {
- if(e.getValue() >= count_threshold)
- {
- System.out.printf("%s%s:%d", (first==false)?",":"", e.getKey(), e.getValue());
- if(first==true) first = false;
- }
- }
- System.out.printf("]\n\n");
- }
-
-
- public static void indexArticleWords(File dirFile,
- HashMap<String, ArrayList<String>> invertList)
- throws FileNotFoundException,UnsupportedEncodingException,IOException
- {
-
- File list[] = dirFile.listFiles();
- String fullName;
-
- for(File f : list)
- {
- fullName = f.getCanonicalPath(); // throws IOException
-
- if(f.isFile()==true)
- {
- HashMap<String, Integer> frequency
- = new HashMap<String, Integer>();
-
- statArticle(f, frequency);
-
- for(String word : frequency.keySet())
- {
- if(invertList.containsKey(word)==true)
- {
- ArrayList<String> oldList = invertList.get(word);
- oldList.add(fullName);
- }
- else
- {
- ArrayList<String> newList = new ArrayList<>();
- newList.add(fullName);
- invertList.put(word, newList);
- }
- }
- }
-
- else if(f.isDirectory()==true)
- {
- indexArticleWords(f, invertList);
- }
- }
- }
-
-
-
- @SuppressWarnings("unchecked")
- public static void main(String args[])
- throws FileNotFoundException,UnsupportedEncodingException,
- IOException,ClassNotFoundException
- {
- // set up root folder
- String rootName = "data/";
- String testName = "data/L1/F1.txt";
- File root;
-
- if(args.length==1)
- {
- rootName = args[0];
- }
-
- root = new File(rootName);
-
- // set up segmenter and hashmap frequency
- segmenter = new Simple();
-
- HashMap<String, Integer> frequency = new HashMap<>();
-
- statArticle(new File(testName), frequency); // 測試文章斷詞
-
- if(new File("invert.dat").exists()==false)
- {
- // set up invertList
- invertList = new HashMap<String, ArrayList<String>>();
-
- indexArticleWords(root, invertList);
-
- //save invertList to file 'invert.dat' and load from file 'invert.dat'
-
- ObjectOutputStream oos = new ObjectOutputStream(
- new FileOutputStream("invert.dat"));
- oos.writeObject(invertList);
- oos.close();
- }
-
-
- ObjectInputStream ois = new ObjectInputStream(
- new FileInputStream("invert.dat"));
- HashMap<String, ArrayList<String>>
- invertList2 = (HashMap<String, ArrayList<String>>) ois.readObject();
- ois.close();
-
- // do the query test on the loaded invertList
- System.out.println("save and load 'invert.dat'\n");
-
- System.out.print("Input the query word: ");
- Scanner sc = new Scanner(System.in);
- String query = sc.next();
-
- //print the file paths which have the query word
- ArrayList<String> list = invertList2.get(query);
- System.out.printf("query '%s' occurs in articles %s\n", query, list);
-
- }
- }
2015年11月17日 星期二
auto recursive indexing of chinese articles for later query use
訂閱:
張貼留言 (Atom)
沒有留言:
張貼留言