/*
Indexer.java
This program uses the mmseg4j.jar package for Chinese word segmentation; given a root directory, it automatically builds an index over every text file in that directory and all of its subdirectories.
The inverted index is saved to the file invert.dat on disk via an object stream, so it can be restored automatically on a later run, making it easy to look up which files a given word occurs in.
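For example, after indexing, the in-memory inverted index is conceptually a map from each
segmented word to the list of files it occurs in; the entry behind the sample query at the
end of this transcript is

    "三峽" -> [\data\L1\L2\E1.txt, \data\L1\L2\E2.txt, ..., \data\L1\L2\E7.txt]

In the run below, each indexed file is also reported with (roughly) its ten most frequent
segmented words and their counts.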
> javac -cp mmseg4j.jar;. Indexer.java
> java -cp mmseg4j.jar;. Indexer \data
init: path=\data
chars loaded time=110ms, line=13060, on file=\data\chars.dic
words loaded time=125ms, line=137450, on file=file:\mmseg4j.jar!\data\words.dic
unit loaded time=0ms, line=22, on file=file:\mmseg4j.jar!\data\units.dic
\data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]
\data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]
\data\L1\F2.txt: [,:88,的:74,、:31, :25,。:25,在:16,海底:16,尋:11,寶:11,﹁:10,沈船:10]
\data\L1\F3.txt: [,:29,的:14,沈船:9,打撈:8,澎湖:6, :5,。:5,工作:5,進行:5,館:5,後:5,古:5]
\data\L1\F4.txt: [,:21,。:13,的:11,船:7,、:6,去年:6,工作:6,澎湖:6,探勘:6,初:5,進行:5,沉:5,包括:5,勘:5,博:5,將軍:5,史:5]
\data\L1\L2\E1.txt: [,:51,的:16,與:9,。:7,主:6,老街:6, :5,做:5,拆:5,三峽:4]
\data\L1\L2\E2.txt: [,:49,的:26,三峽:11,老街:10,。:8,與:7,古蹟:7,、:6,文化:6,而:5,祖師廟:5,保留:5]
\data\L1\L2\E3.txt: [,:36,的:14,。:13,三峽:13,「:7,」:7,主:7,老街:7,協調會:5,發展:5]
\data\L1\L2\E4.txt: [,:53,的:19,。:8,三峽:6,主:6, :5,不:5,拆除:5,在:5,而:4,老街:4,財產:4,住戶:4,改建:4,古蹟:4,保留:4,排除:4,派:4,介入:4]
\data\L1\L2\E5.txt: [,:30, :18,。:10,三峽:10,老街:7,文建會:7,立:7,的:5,派:5,面:5,騎樓:5]
\data\L1\L2\E6.txt: [,:52,的:17,。:9,民眾:8,老街:7,三峽:6,拆:6, :5,而:5,文建會:5]
\data\L1\L2\E7.txt: [,:27,老街:12,。:7,屋:6,街:6, :5,的:5,、:4,與:4,三峽:4,是:4,住戶:4,古蹟:4]
\data\L1\L2\L3\D1.txt: [,:47,「:35,」:35,的:29,、:20,布袋戲:15,。:14,宛然:13,祿:10,天:9,李:9]
\data\L1\L2\L3\D2.txt: [,:23,「:17,」:17,的:14,、:12,。:8,壇:8,藝術:7,儀式:5, :4,主:4,-:4,露天劇場:4,開荒:4,啟用:4]
\data\L1\L2\L3\D3.txt: [,:52,畫:20,的:18,作:17,館:14,。:10,「:10,」:10,資料:10,這些:10]
\data\L1\L2\L3\D4.txt: [,:28,。:12,她:11,、:6,「:6,」:6,貝:6,文:6,王:6,音樂:5,的:5]
\data\L1\L2\M3\C1.txt: [,:27,的:22,」:18,「:17,。:11,中:8,柴可夫斯基:7,盛:7,能:7,余:7]
\data\L1\L2\M3\C2.txt: [,:38,的:27,舞:20,。:19,「:18,」:17,德國:11,能:11,團:11,、:10,舞蹈:10]
save and load 'invert.dat'
Input the query word:
三峽
query '三峽' occurs in articles [\data\L1\L2\E1.txt, \data\L1\L2\E2.txt, \data\L1\L2\E3.txt, \data\L1\L2\E4.txt, \data\L1\L2\E5.txt, \data\L1\L2\E6.txt, \data\L1\L2\E7.txt]
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.FileNotFoundException;
import java.io.UnsupportedEncodingException;
import java.io.IOException;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Scanner;
import com.chenlb.mmseg4j.example.Simple;
public class Indexer
{
private static Simple segmenter; // mmseg4j Simple segmenter (simple maximum-matching segmentation mode)
private static HashMap<String, ArrayList<String>> invertList; // word -> list of file paths containing that word
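// statArticle: read one Big5 text file line by line, segment each line into words with mmseg4j,
// accumulate word counts in 'frequency', then print the file path with roughly its ten most
// frequent words (ties at the 10th-highest count are included, so some files list more than ten).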
public static void statArticle(File file, HashMap<String, Integer> frequency)
throws FileNotFoundException,UnsupportedEncodingException,IOException
{
FileInputStream fis = new FileInputStream(file); // FileNotFoundException
InputStreamReader isr = new InputStreamReader(fis, "big5"); // decode Big5-encoded Chinese text
BufferedReader br = new BufferedReader(isr);
// fill hashmap frequency
String text;
while((text = br.readLine()) != null) // read until end of file; IOException
{
text = text.replaceAll(" ", ""); // remove interference from blanks
String seg_text = segmenter.segWords(text, " ");
//System.out.println(text);
//System.out.println(seg_text);
String words[] = seg_text.split(" ");
for(String w : words)
{
//System.out.println(w.length() + ":" + w);
if(w.length()==0) continue;
if(frequency.containsKey(w)==true)
{
int count = frequency.get(w);
count++;
frequency.put(w,count);
}
else
{
frequency.put(w,1);
}
}
}
br.close();
// process hashmap frequency
int numberWords = frequency.size();
/*
Collection<Integer> counts = frequency.values();
List<Integer> counts_list = new ArrayList<>(counts);
Collections.sort(counts_list);
int count_threshold = counts_list.get(counts_list.size() - 10);
String fullName = file.getCanonicalPath();
System.out.printf("%s: [", fullName);
for(String word : frequency.keySet())
{
int count = frequency.get(word);
if(count >= count_threshold)
{
System.out.printf(",%s:%d", word, count);
}
}
System.out.printf("]\n\n");
*/
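// Sort the (word, count) entries by count in descending order; the count of the 10th entry
// becomes the print threshold, so every word whose count reaches that value is listed for this file.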
Set<Map.Entry<String,Integer>> entries = frequency.entrySet();
List<Map.Entry<String,Integer>> entryList = new ArrayList<Map.Entry<String,Integer>>(entries);
Comparator<Map.Entry<String,Integer>> cmp = new Comparator<Map.Entry<String,Integer>>() {
public int compare(Map.Entry<String,Integer> e1, Map.Entry<String,Integer> e2)
{
return e2.getValue() - e1.getValue(); // descending order
}
};
Collections.sort(entryList, cmp);
// threshold = count of the 10th most frequent word; guard files with fewer than 10 distinct words
int count_threshold = entryList.isEmpty() ? 0 : entryList.get(Math.min(9, entryList.size() - 1)).getValue();
String fullName = file.getCanonicalPath();
System.out.printf("%s: [", fullName);
boolean first=true;
for(Map.Entry<String,Integer> e : entryList)
{
if(e.getValue() >= count_threshold)
{
System.out.printf("%s%s:%d", (first==false)?",":"", e.getKey(), e.getValue());
if(first==true) first = false;
}
}
System.out.printf("]\n\n");
}
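// indexArticleWords: walk the directory tree rooted at dirFile; for each regular file, compute its
// word frequencies with statArticle and append the file's canonical path to the posting list of
// every word that occurs in it; subdirectories are handled by recursion.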
public static void indexArticleWords(File dirFile,
HashMap<String, ArrayList<String>> invertList)
throws FileNotFoundException,UnsupportedEncodingException,IOException
{
File list[] = dirFile.listFiles();
String fullName;
for(File f : list)
{
fullName = f.getCanonicalPath(); // throws IOException
if(f.isFile()==true)
{
HashMap<String, Integer> frequency
= new HashMap<String, Integer>();
statArticle(f, frequency);
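// add this file's path to the posting list of every word that occurs in it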
for(String word : frequency.keySet())
{
if(invertList.containsKey(word)==true)
{
ArrayList<String> oldList = invertList.get(word);
oldList.add(fullName);
}
else
{
ArrayList<String> newList = new ArrayList<>();
newList.add(fullName);
invertList.put(word, newList);
}
}
}
else if(f.isDirectory()==true)
{
indexArticleWords(f, invertList);
}
}
}
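// main: segment one sample article as a smoke test, build the inverted index over the root folder
// (only when no saved 'invert.dat' exists), serialize it with an ObjectOutputStream, read it back
// with an ObjectInputStream, and finally answer one interactive query against the reloaded index.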
@SuppressWarnings("unchecked")
public static void main(String args[])
throws FileNotFoundException,UnsupportedEncodingException,
IOException,ClassNotFoundException
{
// set up root folder
String rootName = "data/";
String testName = "data/L1/F1.txt";
File root;
if(args.length==1)
{
rootName = args[0];
}
root = new File(rootName);
// set up segmenter and hashmap frequency
segmenter = new Simple();
HashMap<String, Integer> frequency = new HashMap<>();
statArticle(new File(testName), frequency); // test segmentation on one sample article
if(new File("invert.dat").exists()==false)
{
// set up invertList
invertList = new HashMap<String, ArrayList<String>>();
indexArticleWords(root, invertList);
//save invertList to file 'invert.dat' and load from file 'invert.dat'
ObjectOutputStream oos = new ObjectOutputStream(
new FileOutputStream("invert.dat"));
oos.writeObject(invertList);
oos.close();
}
ObjectInputStream ois = new ObjectInputStream(
new FileInputStream("invert.dat"));
HashMap<String, ArrayList<String>>
invertList2 = (HashMap<String, ArrayList<String>>) ois.readObject();
ois.close();
// do the query test on the loaded invertList
System.out.println("save and load 'invert.dat'\n");
System.out.print("Input the query word: ");
Scanner sc = new Scanner(System.in);
String query = sc.next();
//print the file paths which have the query word
ArrayList<String> list = invertList2.get(query);
System.out.printf("query '%s' occurs in articles %s\n", query, list);
}
}
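// A minimal query-only sketch, assuming 'invert.dat' was already produced by a previous run of
// Indexer above. The class name QueryOnly and its argument handling are illustrative additions,
// not part of the original program: it simply reloads the serialized index and looks up each
// command-line argument as a query word.
// Example (same classpath as above): java -cp mmseg4j.jar;. QueryOnly 三峽
class QueryOnly
{
    @SuppressWarnings("unchecked")
    public static void main(String args[]) throws IOException, ClassNotFoundException
    {
        // restore the word -> file-path lists saved by Indexer
        ObjectInputStream ois = new ObjectInputStream(new FileInputStream("invert.dat"));
        HashMap<String, ArrayList<String>> invertList =
            (HashMap<String, ArrayList<String>>) ois.readObject();
        ois.close();
        // print the posting list for every query word given on the command line
        for(String query : args)
        {
            System.out.printf("query '%s' occurs in articles %s\n", query, invertList.get(query));
        }
    }
}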