Seke Blog: Automatic recursive indexing of Chinese articles for later querying


/*
  Indexer.java
     本程式利用 mmseg4j.jar 套件作中文斷詞，可從給定目錄自動對所有以下各層文字檔案編製索引。
     索引表以物件串流存到硬碟invert.dat檔案，下回可以自動復原，方便檢索某詞出現在哪些檔案。

> javac -cp mmseg4j.jar;. Indexer.java
> java -cp mmseg4j.jar;. Indexer \data

init: path=\data
chars loaded time=110ms, line=13060, on file=\data\chars.dic
words loaded time=125ms, line=137450, on file=!/data/words.dic
unit loaded time=0ms, line=22, on file=file:\mmseg4j.jar!\data\units.dic
\data\L1\F1.txt: [，:41,的:24,沈船:17,計畫:8,澎湖:8,　:7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]

\data\L1\F1.txt: [，:41,的:24,沈船:17,計畫:8,澎湖:8,　:7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]

\data\L1\F2.txt: [，:88,的:74,、:31,　:25,。:25,在:16,海底:16,尋:11,寶:11,﹁:10,沈船:10]

\data\L1\F3.txt: [，:29,的:14,沈船:9,打撈:8,澎湖:6,　:5,。:5,工作:5,進行:5,館:5,後:5,古:5]

\data\L1\F4.txt: [，:21,。:13,的:11,船:7,、:6,去年:6,工作:6,澎湖:6,探勘:6,初:5,進行:5,沉:5,包括:5,勘:5,博:5,將軍:5,史:5]

\data\L1\L2\E1.txt: [，:51,的:16,與:9,。:7,主:6,老街:6,　:5,做:5,拆:5,三峽:4]

\data\L1\L2\E2.txt: [，:49,的:26,三峽:11,老街:10,。:8,與:7,古蹟:7,、:6,文化:6,而:5,祖師廟:5,保留:5]

\data\L1\L2\E3.txt: [，:36,的:14,。:13,三峽:13,「:7,」:7,主:7,老街:7,協調會:5,發展:5]

\data\L1\L2\E4.txt: [，:53,的:19,。:8,三峽:6,主:6,　:5,不:5,拆除:5,在:5,而:4,老街:4,財產:4,住戶:4,改建:4,古蹟:4,保留:4,排除:4,派:4,介入:4]

\data\L1\L2\E5.txt: [，:30,　:18,。:10,三峽:10,老街:7,文建會:7,立:7,的:5,派:5,面:5,騎樓:5]

\data\L1\L2\E6.txt: [，:52,的:17,。:9,民眾:8,老街:7,三峽:6,拆:6,　:5,而:5,文建會:5]

\data\L1\L2\E7.txt: [，:27,老街:12,。:7,屋:6,街:6,　:5,的:5,、:4,與:4,三峽:4,是:4,住戶:4,古蹟:4]

\data\L1\L2\L3\D1.txt: [，:47,「:35,」:35,的:29,、:20,布袋戲:15,。:14,宛然:13,祿:10,天:9,李:9]

\data\L1\L2\L3\D2.txt: [，:23,「:17,」:17,的:14,、:12,。:8,壇:8,藝術:7,儀式:5,　:4,主:4,－:4,露天劇場:4,開荒:4,啟用:4]

\data\L1\L2\L3\D3.txt: [，:52,畫:20,的:18,作:17,館:14,。:10,「:10,」:10,資料:10,這些:10]

\data\L1\L2\L3\D4.txt: [，:28,。:12,她:11,、:6,「:6,」:6,貝:6,文:6,王:6,音樂:5,的:5]

\data\L1\L2\M3\C1.txt: [，:27,的:22,」:18,「:17,。:11,中:8,柴可夫斯基:7,盛:7,能:7,余:7]

\data\L1\L2\M3\C2.txt: [，:38,的:27,舞:20,。:19,「:18,」:17,德國:11,能:11,團:11,、:10,舞蹈:10]

save and load 'invert.dat'

Input the query word:
三峽
query '三峽' occurs in articles [\data\L1\L2\E1.txt, \data\L1\L2\E2.txt, \data\L1\L2\E3.txt, \data\L1\L2\E4.txt, \data\L1\L2\E5.txt, \data\L1\L2\E6.txt, \data\L1\L2\E7.txt]

*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.FileNotFoundException;
import java.io.UnsupportedEncodingException;
import java.io.IOException;

import java.util.Set;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Scanner;

import com.chenlb.mmseg4j.example.Simple;

/**
 * Builds and queries an inverted index (word -> list of file paths) over the
 * files found recursively under a root directory.
 *
 * <p>Every article is read as Big5 text, split into words by the mmseg4j
 * {@code Simple} segmenter, and its most frequent words are printed. The
 * index is serialized to {@code invert.dat}; when that file already exists
 * it is loaded instead of re-indexing the directory tree.
 */
public class Indexer
{
  /** Character encoding used to read the articles. */
  private static final String ARTICLE_ENCODING = "big5";

  /** File the inverted index is serialized to and restored from. */
  private static final String INDEX_FILE = "invert.dat";

  /** Rank (1-based) whose count is the cut-off for the printed word list. */
  private static final int TOP_WORDS = 10;

  /** Word segmenter; assigned in {@link #main} before any article is read. */
  private static Simple segmenter;

  /** Inverted index built by {@link #main} when no saved index exists. */
  private static HashMap<String, ArrayList<String>> invertList;


  /**
   * Counts the words of one article and prints its most frequent ones.
   *
   * <p>Counts are accumulated into {@code frequency} (existing entries are
   * incremented, not reset). The printed line has the form
   * {@code <canonical path>: [word:count,...]} and contains every word whose
   * count is at least that of the {@value #TOP_WORDS}-th most frequent word;
   * articles with fewer distinct words print all of them.
   *
   * @param file      article to read
   * @param frequency map receiving the word counts
   * @throws FileNotFoundException        if {@code file} cannot be opened
   * @throws UnsupportedEncodingException if the article encoding is unavailable
   * @throws IOException                  if reading or segmenting fails
   */
  public static void statArticle(File file, HashMap<String, Integer> frequency)
    throws FileNotFoundException,UnsupportedEncodingException,IOException
  {
    countWords(file, frequency);
    printTopWords(file, frequency);
  }


  /** Reads {@code file} line by line and adds each segmented word to {@code frequency}. */
  private static void countWords(File file, Map<String, Integer> frequency)
    throws IOException
  {
    // Each stream is declared separately so that all of them are closed even
    // when a later constructor or the read loop throws.
    try (FileInputStream fis = new FileInputStream(file);
         InputStreamReader isr = new InputStreamReader(fis, ARTICLE_ENCODING);
         BufferedReader br = new BufferedReader(isr))
    {
      String text;
      // readLine() == null is the end-of-stream signal; ready() only says
      // whether a read would block.
      while((text = br.readLine()) != null)
      {
        text = text.replace(" ", "");  // remove interference from blanks
        String segText = segmenter.segWords(text, " ");

        for(String w : segText.split(" "))
        {
          if(w.isEmpty()) continue;

          Integer count = frequency.get(w);
          frequency.put(w, (count == null) ? 1 : count + 1);
        }
      }
    }
  }


  /** Prints the canonical path of {@code file} followed by its most frequent words. */
  private static void printTopWords(File file, Map<String, Integer> frequency)
    throws IOException
  {
    Set<Map.Entry<String,Integer>> entries = frequency.entrySet();
    List<Map.Entry<String,Integer>> entryList =
      new ArrayList<Map.Entry<String,Integer>>(entries);

    Collections.sort(entryList, new Comparator<Map.Entry<String,Integer>>() {
      @Override
      public int compare(Map.Entry<String,Integer> e1, Map.Entry<String,Integer> e2)
      {
        // descending by count; Integer.compare cannot overflow like subtraction
        return Integer.compare(e2.getValue(), e1.getValue());
      }
    });

    String fullName = file.getCanonicalPath();
    System.out.printf("%s: [", fullName);

    if(!entryList.isEmpty())
    {
      // Clamp the cut-off rank so short articles do not index past the end.
      int cutoffIndex = Math.min(TOP_WORDS, entryList.size()) - 1;
      int countThreshold = entryList.get(cutoffIndex).getValue();

      boolean first = true;
      for(Map.Entry<String,Integer> e : entryList)
      {
        // The list is sorted descending, so nothing after this can qualify.
        if(e.getValue() < countThreshold) break;

        System.out.printf("%s%s:%d", first ? "" : ",", e.getKey(), e.getValue());
        first = false;
      }
    }
    System.out.printf("]\n\n");
  }


  /**
   * Recursively indexes every regular file under {@code dirFile}.
   *
   * <p>For each file, {@link #statArticle} is run on a fresh frequency map
   * and the file's canonical path is appended to the list of every word that
   * occurs in it. Sub-directories are descended into; entries that are
   * neither regular files nor directories are ignored.
   *
   * @param dirFile    directory to scan
   * @param invertList inverted index to add to (word -> file paths)
   * @throws FileNotFoundException        if an article cannot be opened
   * @throws UnsupportedEncodingException if the article encoding is unavailable
   * @throws IOException                  if {@code dirFile} cannot be listed
   *                                      or an article cannot be read
   */
  public static void indexArticleWords(File dirFile,
    HashMap<String, ArrayList<String>> invertList)
    throws FileNotFoundException,UnsupportedEncodingException,IOException
  {
    File list[] = dirFile.listFiles();
    if(list == null)
    {
      // listFiles() returns null when the path is not a directory or an
      // I/O error occurs; fail with the path instead of a bare NPE.
      throw new IOException("cannot list directory: " + dirFile);
    }

    for(File f : list)
    {
      if(f.isFile())
      {
        String fullName = f.getCanonicalPath();
        HashMap<String, Integer> frequency = new HashMap<String, Integer>();

        statArticle(f, frequency);

        for(String word : frequency.keySet())
        {
          ArrayList<String> paths = invertList.get(word);
          if(paths == null)
          {
            paths = new ArrayList<String>();
            invertList.put(word, paths);
          }
          paths.add(fullName);
        }
      }
      else if(f.isDirectory())
      {
        indexArticleWords(f, invertList);
      }
    }
  }


  /**
   * Entry point: builds (or restores) the inverted index and answers one query.
   *
   * <p>The root directory is {@code args[0]} when exactly one argument is
   * given, otherwise {@code data/}. If {@code invert.dat} does not exist the
   * tree is indexed and the index saved; the index is then always loaded back
   * from {@code invert.dat}. One word is read from standard input and the
   * paths of the articles containing it are printed.
   *
   * <p>NOTE: an existing {@code invert.dat} is used as-is, even if it was
   * built from a different root; delete it to force re-indexing.
   *
   * @param args optional single argument: the root directory to index
   * @throws FileNotFoundException        if an article or the index file cannot be opened
   * @throws UnsupportedEncodingException if the article encoding is unavailable
   * @throws IOException                  on any other read/write failure
   * @throws ClassNotFoundException       if the saved index cannot be deserialized
   */
  public static void main(String args[])
    throws FileNotFoundException,UnsupportedEncodingException,
           IOException,ClassNotFoundException
  {
     // set up root folder
     String rootName = "data/";
     String testName = "data/L1/F1.txt";

     if(args.length==1)
     {
       rootName = args[0];
     }

     File root = new File(rootName);

     // set up segmenter
     segmenter = new Simple();

     // Test article word segmentation on the sample file. The sample path is
     // fixed, so skip the test when it does not exist (e.g. a custom root).
     File sample = new File(testName);
     if(sample.isFile())
     {
       statArticle(sample, new HashMap<String, Integer>());
     }

     File indexFile = new File(INDEX_FILE);
     if(!indexFile.exists())
     {
       // build the index first so that a failure leaves no index file behind
       invertList = new HashMap<String, ArrayList<String>>();
       indexArticleWords(root, invertList);

       try (FileOutputStream fos = new FileOutputStream(indexFile);
            ObjectOutputStream oos = new ObjectOutputStream(fos))
       {
         oos.writeObject(invertList);
       }
     }

     // NOTE(review): Java-native deserialization; only safe because
     // invert.dat is a local file written by this program.
     HashMap<String, ArrayList<String>> loaded;
     try (FileInputStream fis = new FileInputStream(indexFile);
          ObjectInputStream ois = new ObjectInputStream(fis))
     {
       // The file is written above as exactly this type.
       @SuppressWarnings("unchecked")
       HashMap<String, ArrayList<String>> restored =
         (HashMap<String, ArrayList<String>>) ois.readObject();
       loaded = restored;
     }

     // do the query test on the loaded index
     System.out.println("save and load 'invert.dat'\n");

     System.out.print("Input the query word: ");
     Scanner sc = new Scanner(System.in);
     if(!sc.hasNext())
     {
       return;  // stdin closed without a query word
     }
     String query = sc.next();

     // print the file paths which have the query word; an unknown word
     // yields an empty list rather than "null"
     List<String> list = loaded.get(query);
     if(list == null)
     {
       list = Collections.emptyList();
     }
     System.out.printf("query '%s' occurs in articles %s\n", query, list);
  }
}
Seke Blog

Automatic recursive indexing of Chinese articles for later querying

沒有留言:

Building a Lightweight Streamlit Client for Local Ollama LLM Interaction

總網頁瀏覽量