2015年11月17日 星期二

auto recursive indexing of chinese articles for later query use


/*
  Indexer.java
     本程式利用 mmseg4j.jar 套件作中文斷詞,可從給定目錄自動對所有以下各層文字檔案編製索引。
     索引表以物件串流存到硬碟invert.dat檔案,下回可以自動復原,方便檢索某詞出現在哪些檔案。

> javac -cp mmseg4j.jar;. Indexer.java
> java -cp mmseg4j.jar;. Indexer \data

init: path=\data
chars loaded time=110ms, line=13060, on file=\data\chars.dic
words loaded time=125ms, line=137450, on file=!/data/words.dic
unit loaded time=0ms, line=22, on file=file:\mmseg4j.jar!\data\units.dic
\data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]

\data\L1\F1.txt: [,:41,的:24,沈船:17,計畫:8,澎湖:8, :7,。:7,發掘:7,為:7,初:6,進行:6,該:6,勘:6,古:6]

\data\L1\F2.txt: [,:88,的:74,、:31, :25,。:25,在:16,海底:16,尋:11,寶:11,﹁:10,沈船:10]

\data\L1\F3.txt: [,:29,的:14,沈船:9,打撈:8,澎湖:6, :5,。:5,工作:5,進行:5,館:5,後:5,古:5]

\data\L1\F4.txt: [,:21,。:13,的:11,船:7,、:6,去年:6,工作:6,澎湖:6,探勘:6,初:5,進行:5,沉:5,包括:5,勘:5,博:5,將軍:5,史:5]

\data\L1\L2\E1.txt: [,:51,的:16,與:9,。:7,主:6,老街:6, :5,做:5,拆:5,三峽:4]

\data\L1\L2\E2.txt: [,:49,的:26,三峽:11,老街:10,。:8,與:7,古蹟:7,、:6,文化:6,而:5,祖師廟:5,保留:5]

\data\L1\L2\E3.txt: [,:36,的:14,。:13,三峽:13,「:7,」:7,主:7,老街:7,協調會:5,發展:5]

\data\L1\L2\E4.txt: [,:53,的:19,。:8,三峽:6,主:6, :5,不:5,拆除:5,在:5,而:4,老街:4,財產:4,住戶:4,改建:4,古蹟:4,保留:4,排除:4,派:4,介入:4]

\data\L1\L2\E5.txt: [,:30, :18,。:10,三峽:10,老街:7,文建會:7,立:7,的:5,派:5,面:5,騎樓:5]

\data\L1\L2\E6.txt: [,:52,的:17,。:9,民眾:8,老街:7,三峽:6,拆:6, :5,而:5,文建會:5]

\data\L1\L2\E7.txt: [,:27,老街:12,。:7,屋:6,街:6, :5,的:5,、:4,與:4,三峽:4,是:4,住戶:4,古蹟:4]

\data\L1\L2\L3\D1.txt: [,:47,「:35,」:35,的:29,、:20,布袋戲:15,。:14,宛然:13,祿:10,天:9,李:9]

\data\L1\L2\L3\D2.txt: [,:23,「:17,」:17,的:14,、:12,。:8,壇:8,藝術:7,儀式:5, :4,主:4,-:4,露天劇場:4,開荒:4,啟用:4]

\data\L1\L2\L3\D3.txt: [,:52,畫:20,的:18,作:17,館:14,。:10,「:10,」:10,資料:10,這些:10]

\data\L1\L2\L3\D4.txt: [,:28,。:12,她:11,、:6,「:6,」:6,貝:6,文:6,王:6,音樂:5,的:5]

\data\L1\L2\M3\C1.txt: [,:27,的:22,」:18,「:17,。:11,中:8,柴可夫斯基:7,盛:7,能:7,余:7]

\data\L1\L2\M3\C2.txt: [,:38,的:27,舞:20,。:19,「:18,」:17,德國:11,能:11,團:11,、:10,舞蹈:10]

save and load 'invert.dat'

Input the query word:
三峽
query '三峽' occurs in articles [\data\L1\L2\E1.txt, \data\L1\L2\E2.txt, \data\L1\L2\E3.txt,\data\L1\L2\E4.txt, \data\L1\L2\E5.txt, \data\L1\L2\E6.txt, \data\L1\L2\E7.txt]

*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.FileNotFoundException;
import java.io.UnsupportedEncodingException;
import java.io.IOException;

import java.util.Set;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Scanner;

import com.chenlb.mmseg4j.example.Simple;

public class Indexer
{
  // Chinese word segmenter backed by the mmseg4j library.
  private static Simple segmenter;
  // Inverted index: word -> canonical paths of the files containing it.
  private static HashMap<String, ArrayList<String>> invertList;

  // How many of the highest-frequency entries to report per article.
  private static final int TOP_WORDS = 10;


  /**
   * Reads one big5-encoded text file, segments it into Chinese words with
   * mmseg4j, and accumulates per-word counts into {@code frequency}.
   * Afterwards prints the file's canonical path followed by every word whose
   * count reaches the TOP_WORDS-th highest count (ties included), in
   * descending count order.
   *
   * @param file      big5-encoded text file to analyse
   * @param frequency map filled with word counts (existing counts are kept)
   * @throws FileNotFoundException        if {@code file} cannot be opened
   * @throws UnsupportedEncodingException if the JVM lacks the big5 charset
   * @throws IOException                  on read errors
   */
  public static void statArticle(File file, HashMap<String, Integer> frequency)
    throws FileNotFoundException,UnsupportedEncodingException,IOException
  {
    // try-with-resources guarantees the reader is closed even when an
    // IOException is thrown mid-read (the original leaked the stream).
    try (BufferedReader br = new BufferedReader(
           new InputStreamReader(new FileInputStream(file), "big5")))
    {
      // fill hashmap frequency
      while(br.ready())  // IOException
      {
        String text = br.readLine();
        text = text.replaceAll(" ", "");  // remove interference from blanks
        String seg_text = segmenter.segWords(text, " ");
        String words[] = seg_text.split(" ");

        for(String w : words)
        {
          if(w.length()==0) continue;  // split() can yield empty tokens

          Integer count = frequency.get(w);
          frequency.put(w, (count == null) ? 1 : count + 1);
        }
      }
    }

    // Sort the (word, count) entries by descending count.
    List<Map.Entry<String,Integer>> entryList =
      new ArrayList<Map.Entry<String,Integer>>(frequency.entrySet());
    Comparator<Map.Entry<String,Integer>> cmp =
      new Comparator<Map.Entry<String,Integer>>() {
        public int compare(Map.Entry<String,Integer> e1, Map.Entry<String,Integer> e2)
        {
          // Integer.compare avoids the overflow risk of subtracting counts.
          return Integer.compare(e2.getValue(), e1.getValue());  // descending order
        }
      };
    Collections.sort(entryList, cmp);

    // Threshold = count of the TOP_WORDS-th entry. Guard against articles
    // with fewer than TOP_WORDS distinct words: the original called
    // entryList.get(9) unconditionally and threw IndexOutOfBoundsException.
    int limit = Math.min(TOP_WORDS, entryList.size());
    int count_threshold =
      (limit == 0) ? Integer.MAX_VALUE : entryList.get(limit - 1).getValue();

    String fullName = file.getCanonicalPath();
    System.out.printf("%s: [", fullName);
    boolean first = true;
    for(Map.Entry<String,Integer> e : entryList)
    {
      if(e.getValue() >= count_threshold)
      {
        System.out.printf("%s%s:%d", first ? "" : ",", e.getKey(), e.getValue());
        first = false;
      }
    }
    System.out.printf("]\n\n");
  }


  /**
   * Recursively walks {@code dirFile}, runs {@link #statArticle} on every
   * regular file, and records each distinct word of that file into
   * {@code invertList} (word -> list of canonical paths containing it).
   *
   * @param dirFile    directory to index, descended depth-first
   * @param invertList inverted index being built (mutated in place)
   * @throws FileNotFoundException        if an article cannot be opened
   * @throws UnsupportedEncodingException if the JVM lacks the big5 charset
   * @throws IOException                  on read or path-resolution errors
   */
  public static void indexArticleWords(File dirFile,
    HashMap<String, ArrayList<String>> invertList)
    throws FileNotFoundException,UnsupportedEncodingException,IOException
  {
    File list[] = dirFile.listFiles();
    // listFiles() returns null for a non-directory or on an I/O error;
    // the original dereferenced it unconditionally and could NPE.
    if(list == null)
    {
      return;
    }

    for(File f : list)
    {
      String fullName = f.getCanonicalPath(); // throws IOException

      if(f.isFile())
      {
        HashMap<String, Integer> frequency = new HashMap<String, Integer>();

        statArticle(f, frequency);

        // Register this file under every distinct word it contains.
        for(String word : frequency.keySet())
        {
          ArrayList<String> files = invertList.get(word);
          if(files == null)
          {
            files = new ArrayList<>();
            invertList.put(word, files);
          }
          files.add(fullName);
        }
      }
      else if(f.isDirectory())
      {
        indexArticleWords(f, invertList);  // recurse into subdirectory
      }
    }
  }



  /**
   * Builds (or reuses) the inverted index for the folder named by
   * {@code args[0]} (default "data/"), persists it to 'invert.dat' via
   * object serialization, reloads it, then answers one interactive query
   * by printing every indexed file containing the query word.
   */
  @SuppressWarnings("unchecked")
  public static void main(String args[])
    throws FileNotFoundException,UnsupportedEncodingException,
           IOException,ClassNotFoundException
  {
     // set up root folder
     String rootName = "data/";
     String testName = "data/L1/F1.txt";

     if(args.length==1)
     {
       rootName = args[0];
     }

     File root = new File(rootName);

     // set up segmenter and hashmap frequency
     segmenter = new Simple();

     HashMap<String, Integer> frequency = new HashMap<>();

     statArticle(new File(testName), frequency);  // smoke-test segmentation on one article

     if(!new File("invert.dat").exists())
     {
       // Build the inverted index and persist it for later runs.
       invertList = new HashMap<String, ArrayList<String>>();

       indexArticleWords(root, invertList);

       // try-with-resources closes the stream even if writeObject fails
       // (the original left it open on the exception path).
       try (ObjectOutputStream oos = new ObjectOutputStream(
              new FileOutputStream("invert.dat")))
       {
         oos.writeObject(invertList);
       }
     }

     // Reload the index from disk. The unchecked cast is safe because this
     // program is the only writer of 'invert.dat'.
     HashMap<String, ArrayList<String>> invertList2;
     try (ObjectInputStream ois = new ObjectInputStream(
            new FileInputStream("invert.dat")))
     {
       invertList2 = (HashMap<String, ArrayList<String>>) ois.readObject();
     }

     // do the query test on the loaded invertList
     System.out.println("save and load 'invert.dat'\n");

     System.out.print("Input the query word: ");
     Scanner sc = new Scanner(System.in);
     String query = sc.next();

     // print the file paths which have the query word; null means the
     // word never occurred in any indexed article
     ArrayList<String> list = invertList2.get(query);
     System.out.printf("query '%s' occurs in articles %s\n", query, list);

  }
}

2015年11月12日 星期四

an example to use the modbus rtu protocol to query/read/write the motor drivers

Modbus應用層協定常作為工業設備之間的溝通語言,
RS485實體層協定則常作為電腦及設備之間較長距離、耐雜訊的溝通工具。
因此,若想由電腦對馬達設備進行控制,常見利用
Modbus及RS485建立電腦和馬達驅動器之間的連線。

電腦和馬達驅動器之間的溝通關係如下:

電腦                              馬達驅動器
------                            --------------
客戶應用程式
MODBUS應用層<--------------------->MODBUS應用層
COM1序列埠
RS485實體層<---------------------->RS485實體層

電腦端應用層對實體層的介面就是COM1序列埠。
電腦及馬達兩者間的應用層溝通就靠MODBUS RTU協定。
MODBUS RTU (remote terminal unit)協定可針對特定id的馬達驅動器,
利用序列埠發出指令,讀寫不同暫存器,達成驅動馬達運轉的目的。

以下將以東方馬達的 AZ 系列驅動器,其支援的modbus指令為例,
介紹其利用modbus rtu協定讀、寫、查詢驅動器的溝通過程,
可作為撰寫序列埠相關程式之參考。
範例出自手冊 pp224-228, HM-60260C(AZ).pdf。

--------- 讀取一段連續暫存器 -----------

A.欲讀取馬達驅動器一段連續暫存器的資料:
A1.先循序對COM1寫出8 bytes如下:

id:         01h     表 馬達驅動器id
code:       03h     表 讀取一段連續暫存器 指令
start_addr: 18h,40h 表 1840h 暫存器起始住址
addr_count: 00h,06h 表 0006h 連續暫存器個數
crc:        c2h,bch 表驗證碼 bcc2h

表示對馬達驅動器id=01h,下03h號指令讀取一段連續暫存器,
回傳起始位址1840h,連續0006h個16bit暫存器的內容

A2.再從COM1循序接收結果如下:

01h      表馬達驅動器id,應為剛才id
03h      表讀取指令之回應結果,應為剛才code
0ch      表後續的byte個數,應為addr_count兩倍
00h, 00h 表 1840h 內容 0000h
00h, 02h 表 1841h 內容 0002h
ffh, ffh 表 1842h 內容 ffffh
d8h, f0h 表 1843h 內容 d8f0h
00h, 00h,表 1844h 內容 0000h
27h, 10h,表 1845h 內容 2710h
82h, eah 表 驗證碼ea82h



--------- 寫入一段連續暫存器 -----------

B.欲寫入資料到馬達驅動器的一段連續暫存器:
B1.先循序對COM1寫出21 bytes如下:

id:         04h     表 馬達驅動器id
code:       10h     表 寫入一段連續暫存器 指令
start_addr: 18h,c6h 表 18c6h 暫存器起始住址
addr_count: 00h,06h 表 0006h 連續暫存器個數
byte_count: 0ch     表後續的byte個數,應為addr_count兩倍
data:
  00h,00h           表 18c6h 內容 0000h
  27h,10h           表 18c7h 內容 2710h
  00h,00h           表 18c8h 內容 0000h
  4eh,20h           表 18c9h 內容 4e20h
  00h,00h,          表 18cah 內容 0000h
  01h,f4h,          表 18cbh 內容 01f4h
crc:        6ch,a0h 表 驗證碼a06ch

表示對馬達驅動器id=04h,下10h號指令寫入一段連續暫存器,
將後續0ch個byte,依序寫入起始位址18c6h,連續0006h個16bit暫存器

B2.再從COM1接收回應結果如下

04h     表馬達驅動器id,應為剛才id
10h     表寫入指令之回應結果,應為剛才code
18h,c6h 表 18c6h 起始住址,應為剛才start_addr
00h,06h 表 0006h 連續住址個數,應為剛才addr_count
a6h,c3h 表 驗證碼c3a6h



--------- 寫入單一暫存器 -----------

C.欲寫出資料到馬達驅動器的單一暫存器:
C1.先循序對COM1寫出8 bytes如下:

id:    02h     表 馬達驅動器id
code:  06h     表 寫入單一暫存器 指令
addr:  02h,55h 表寫入住址0255h
data:  00h,50h 表寫入內容0050h
crc:   98h,6dh 表驗證碼6d98h

表示對馬達驅動器id=02h,下06h號指令寫入單一暫存器,
將data=0050h,寫入addr=0255h的16bit暫存器

C2.再從COM1接收回應結果如下:

02h 表馬達驅動器id,應為剛才id
06h 表寫入指令之回應結果,應為剛才code
02h,55h 表 0255h 暫存器住址,應為剛才addr
00h,50h 表 0050h 寫入內容,應為剛才data
98h, 6dh 表 驗證碼6d98h



--------- 診斷驅動器 -----------

D.欲診斷馬達驅動器:
D1.先循序對COM1寫出8 bytes如下:

id:          03h     表 馬達驅動器id
code:        08h     表 診斷 指令
subcode:     00h,00h 表子功能0000h
data:        12h,34h 表任意測試資料1234h
crc:         ech,9eh 表驗證碼9eech

表示對馬達驅動器id=03h,下08h號指令進行0000h號子功能診斷,
將隨意資料data=1234h送出,看會不會回傳該資料回來

D2.再從COM1接收回應結果如下: 應和剛才送出內容完全一樣

id:          03h     表 馬達驅動器id
code:        08h     表 診斷 指令
subcode:     00h,00h 表子功能0000h
data:        12h,34h 表任意測試資料1234h
crc:         ech,9eh 表驗證碼9eech

-

幾種環境的 Serial Port 寫法:

Linux/Cygwin C寫法:
  https://www.cmrr.umn.edu/~strupp/serial.html
  http://www.teuniz.net/RS-232/

Windows Win32 C寫法:
   http://cboard.cprogramming.com/windows-programming/141173-windows-serial-programming.html

Windows C#/C++/VB .NET寫法:
   https://msdn.microsoft.com/zh-tw/library/system.io.ports.serialport(v=vs.110).aspx

SerialPort存取原理
  http://www.dotblogs.com.tw/billchung/category/5702.aspx

PS:
https://en.wikipedia.org/wiki/Modbus
http://www.modbus.org/tech.php 標準文件及多種平台之程式碼

2015年11月10日 星期二

how to extract hand drawn figures from photos

當你用手機拍照手畫圖案時,常常會因為採光關係,出現深淺不一色調,如左圖所示。這時若用photoshop之類的影像處理軟體,其提供的單一臨界值調整(threshold adjustment)工具,很難單獨萃取手畫圖案出來。


若會python程式,建議改用opencv套件提供的adaptiveThreshold方法,可自動調整臨界值,萃取手畫圖案。經適當調整參數,發現效果不錯,如右圖所示。其程式寫法如下:
       目的圖destin = adaptiveThreshold(src來源圖, maxValue像素最大值, 
                adaptiveMethod自動調整法, thresholdType臨界值套用法,
                blockSize參考方塊邊長_像素為單位, C方塊內像素加權和扣掉常數值當成臨界值)
        --
        import cv2
        import matplotlib.pyplot as plt
        %matplotlib inline

        input = 'c:/path/source.jpg'
        output = 'c:/path/destin.jpg'
        img = cv2.imread(input,0)
        img = cv2.medianBlur(img,5)
        newimg = cv2.adaptiveThreshold(img, 255,\
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  cv2.THRESH_BINARY, \
            11, 10)
        cv2.imwrite( output, newimg )

2015年11月3日 星期二

weka.classifiers.functions.Winnow

weka.classifiers.functions.Winnow 屬錯誤驅動型學習器,
只處理文字屬性,將之轉成二元屬性,用來預測二元類別值。可線上累進學習。
適用在案例集屬性眾多,卻多數和預測不相關情況,可以快速鎖定相關屬性作預測。

給定案例屬性(a0, a1, ..., ak),門檻 theta,權重升級係數alpha,權重降級係數beta,
權重向量(w0, w1, ..., wk)或(w0+ - w0-, w1+ - w1-, ..., wk+ - wk-)
其中,所有符號皆為正數,擴充屬性 a0 恆為 1。則預測式有二: 

  不平衡版: 權重向量各維度只能正數
     w0 * a0 + w1 * a1 + ... + wk * ak > theta 表類別1; 否則類別2

  平衡版:  權重向量各維度允許負數
    (w0+ - w0-) * a0 + (w1+ - w1-) * a1 + ... + (wk+ - wk-) * ak > theta 表類別1; 否則類別2

學習過程若遇預測錯誤,則權重向量調整法如下:
  類別2誤為類別1:   w *= beta  或 w+ *= beta  and w- *= alpha 讓權重變小
  類別1誤為類別2:   w *= alpha 或 w+ *= alpha and w- *= beta  讓權重變大

參數說明:
 -L  使用平衡版。預設值false
 -I  套用訓練集學習權重的輪數。預設值1
 -A  權重升級係數alpha,需>1。預設值2.0
 -B  權重降級係數beta,需<1。預設值0.5
 -H  預測門檻theta。預設值-1,表示屬性個數
 -W  權重初始值,需>0。預設值2.0
 -S  亂數種子,影響訓練集的案例訓練順序。預設值1


> java  weka.classifiers.functions.Winnow  -t data\weather.nominal.arff


Winnow

Attribute weights

w0 8.0
w1 1.0
w2 2.0
w3 4.0
w4 2.0
w5 2.0
w6 1.0
w7 1.0

Cumulated mistake count: 7


Time taken to build model: 0 seconds
Time taken to test model on training data: 0 seconds

=== Error on training data ===

Correctly Classified Instances          10               71.4286 %
Incorrectly Classified Instances         4               28.5714 %
Kappa statistic                          0.3778
Mean absolute error                      0.2857
Root mean squared error                  0.5345
Relative absolute error                 61.5385 %
Root relative squared error            111.4773 %
Total Number of Instances               14     


=== Confusion Matrix ===

 a b   <-- classified as
 7 2 | a = yes
 2 3 | b = no



=== Stratified cross-validation ===

Correctly Classified Instances           7               50      %
Incorrectly Classified Instances         7               50      %
Kappa statistic                         -0.2564
Mean absolute error                      0.5   
Root mean squared error                  0.7071
Relative absolute error                105      %
Root relative squared error            143.3236 %
Total Number of Instances               14     


=== Confusion Matrix ===

 a b   <-- classified as
 7 2 | a = yes
 5 0 | b = no

如下 weather.nominal.arff 案例集的14個案例利用4個文字屬性,預測文字屬性。
outlook temperature humidity windy play
sunny hot high FALSE no
sunny hot high TRUE no
overcast hot high FALSE yes
rainy mild high FALSE yes
rainy cool normal FALSE yes
rainy cool normal TRUE no
overcast cool normal TRUE yes
sunny mild high FALSE no
sunny cool normal FALSE yes
rainy mild normal FALSE yes
sunny mild normal TRUE yes
overcast mild high TRUE yes
overcast hot normal FALSE yes
rainy mild high TRUE no
參考: 1.weka.classifiers.functions.Winnow code | doc

weka.classifiers.functions.VotedPerceptron

weka.classifiers.functions.VotedPerceptron 為投票型感知器,屬錯誤驅動型學習器。
先全域性取代缺值,再轉換文字屬性為二元屬性,適用於預測二元類別值,可線上累進學習。

給定案例屬性 a=(a0, a1, ..., ak),權重向量 w=(w0, w1, ..., wk)
其中,a 屬性值為二元值 0 或 1,擴充屬性 a0 恆為 1。
預測式為
  w0 * a0 + w1 * a1 + ... + wk * ak > 0 表類別1; 否則類別2

學習過程若遇預測錯誤,則權重向量調整法如下:
  類別2誤為類別1:   w -= a  讓權重變小
  類別1誤為類別2:   w += a  讓權重變大

參數說明:
 -I  套用訓練集學習權重的輪數。預設值1
 -E  多項式核函數(polynomial kernel)之次方。預設值1
 -S  亂數種子,影響訓練集的案例訓練順序。預設值1
 -M  最大允許權重修正次數。預設值10000

> java  weka.classifiers.functions.VotedPerceptron  -t data\weather.numeric.arff


VotedPerceptron: Number of perceptrons=5


Time taken to build model: 0 seconds
Time taken to test model on training data: 0 seconds

=== Error on training data ===

Correctly Classified Instances           9               64.2857 %
Incorrectly Classified Instances         5               35.7143 %
Kappa statistic                          0     
Mean absolute error                      0.3623
Root mean squared error                  0.587 
Relative absolute error                 78.0299 %
Root relative squared error            122.4306 %
Total Number of Instances               14     


=== Confusion Matrix ===

 a b   <-- classified as
 9 0 | a = yes
 5 0 | b = no



=== Stratified cross-validation ===

Correctly Classified Instances           9               64.2857 %
Incorrectly Classified Instances         5               35.7143 %
Kappa statistic                          0     
Mean absolute error                      0.3736
Root mean squared error                  0.589 
Relative absolute error                 78.4565 %
Root relative squared error            119.3809 %
Total Number of Instances               14     


=== Confusion Matrix ===

 a b   <-- classified as
 9 0 | a = yes
 5 0 | b = no


如下 weather.numeric.arff 案例集的14個案例利用2個文字屬性及2個數字屬性,預測文字屬性。
outlook temperature humidity windy play
sunny 85 85 FALSE no
sunny 80 90 TRUE no
rainy 65 70 TRUE no
sunny 72 95 FALSE no
rainy 71 91 TRUE no
overcast 83 86 FALSE yes
rainy 70 96 FALSE yes
rainy 68 80 FALSE yes
overcast 64 65 TRUE yes
sunny 69 70 FALSE yes
rainy 75 80 FALSE yes
sunny 75 70 TRUE yes
overcast 72 90 TRUE yes
overcast 81 75 FALSE yes
參考: 1.weka.classifiers.functions.VotedPerceptron code | doc

weka.classifiers.functions.Logistic

weka.classifiers.functions.Logistic 為羅吉斯迴歸學習器,
建立多類別羅吉斯迴歸模型,含嶺迴歸估計量(ridge estimator)參數,可用來預測類別值。
缺值由ReplaceMissingValuesFilter過濾器補值,文字屬性由NominalToBinaryFilter過濾器轉為數字。
 
參數說明:
 -R <ridge> 設定log相似度的嶺迴歸估計量。預設值1e-8
 -M <number> 設定最大迭代次數。預設值 -1 表示直到收斂為止


> java  weka.classifiers.functions.Logistic  -t data\weather.numeric.arff


Logistic Regression with ridge parameter of 1.0E-8
Coefficients...
                          Class
Variable                    yes
===============================
outlook=sunny           -6.4257
outlook=overcast        13.5922
outlook=rainy           -5.6562
temperature             -0.0776
humidity                -0.1556
windy                    3.7317
Intercept                22.234


Odds Ratios...
                          Class
Variable                    yes
===============================
outlook=sunny            0.0016
outlook=overcast    799848.4279
outlook=rainy            0.0035
temperature              0.9254
humidity                 0.8559
windy                   41.7508


Time taken to build model: 0 seconds
Time taken to test model on training data: 0 seconds

=== Error on training data ===

Correctly Classified Instances          11               78.5714 %
Incorrectly Classified Instances         3               21.4286 %
Kappa statistic                          0.5532
Mean absolute error                      0.2066
Root mean squared error                  0.3273
Relative absolute error                 44.4963 %
Root relative squared error             68.2597 %
Total Number of Instances               14     


=== Confusion Matrix ===

 a b   <-- classified as
 7 2 | a = yes
 1 4 | b = no



=== Stratified cross-validation ===

Correctly Classified Instances           8               57.1429 %
Incorrectly Classified Instances         6               42.8571 %
Kappa statistic                          0.0667
Mean absolute error                      0.4548
Root mean squared error                  0.6576
Relative absolute error                 95.5132 %
Root relative squared error            133.2951 %
Total Number of Instances               14     


=== Confusion Matrix ===

 a b   <-- classified as
 6 3 | a = yes
 3 2 | b = no


如下 weather.numeric.arff 案例集的14個案例利用2個文字屬性及2個數字屬性,預測文字屬性。
outlook temperature humidity windy play
sunny 85 85 FALSE no
sunny 80 90 TRUE no
rainy 65 70 TRUE no
sunny 72 95 FALSE no
rainy 71 91 TRUE no
overcast 83 86 FALSE yes
rainy 70 96 FALSE yes
rainy 68 80 FALSE yes
overcast 64 65 TRUE yes
sunny 69 70 FALSE yes
rainy 75 80 FALSE yes
sunny 75 70 TRUE yes
overcast 72 90 TRUE yes
overcast 81 75 FALSE yes
參考: 1.weka.classifiers.functions.Logistic code | doc

weka.classifiers.functions.LinearRegression

weka.classifiers.functions.LinearRegression 為標準線性迴歸學習器,
學習各數值屬性的權重,建立線性方程式模型,預測數值類別。

參數說明:
-S select_attribute_code 屬性挑選法代碼,0 表M5',1 表無,2 表Greedy。預設值 0。


> java  weka.classifiers.functions.LinearRegression  -t data\cpu.arff


Linear Regression Model

class =

      0.0491 * MYCT +
      0.0152 * MMIN +
      0.0056 * MMAX +
      0.6298 * CACH +
      1.4599 * CHMAX +
    -56.075 


Time taken to build model: 0.02 seconds
Time taken to test model on training data: 0.02 seconds

=== Error on training data ===

Correlation coefficient                  0.93
Mean absolute error                     37.9748
Root mean squared error                 58.9899
Relative absolute error                 39.592  %
Root relative squared error             36.7663 %
Total Number of Instances              209     



=== Cross-validation ===

Correlation coefficient                  0.9012
Mean absolute error                     41.0886
Root mean squared error                 69.556 
Relative absolute error                 42.6943 %
Root relative squared error             43.2421 %
Total Number of Instances              209     


cpu.arff 資料集有209案例,每個案例由6個數值屬性預測1個數值屬性。

MYCT MMIN MMAX CACH CHMIN CHMAX class
125 256 6000 256 16 128 198
29 8000 32000 32 8 32 269
29 8000 32000 32 8 32 220
29 8000 32000 32 8 32 172
29 8000 16000 32 8 16 132
26 8000 32000 64 8 32 318
23 16000 32000 64 16 32 367
23 16000 32000 64 16 32 489
23 16000 64000 64 16 32 636
.....





參考: 1.weka.classifiers.functions.LinearRegression code | doc

weka.classifiers.functions.SimpleLinearRegression

weka.classifiers.functions.SimpleLinearRegression 為簡單線性迴歸學習器,
簡單指的是只挑一個平方誤差最小的屬性作線性預測。
只適用數值對數值的預測,不接受缺值案例。

> java  weka.classifiers.functions.SimpleLinearRegression  -t data\cpu.arff


Linear regression on MMAX

0.01 * MMAX - 34


Time taken to build model: 0 seconds
Time taken to test model on training data: 0.02 seconds

=== Error on training data ===

Correlation coefficient                  0.863
Mean absolute error                     50.8658
Root mean squared error                 81.0566
Relative absolute error                 53.0319 %
Root relative squared error             50.5197 %
Total Number of Instances              209     



=== Cross-validation ===

Correlation coefficient                  0.7844
Mean absolute error                     53.8054
Root mean squared error                 99.5674
Relative absolute error                 55.908  %
Root relative squared error             61.8997 %
Total Number of Instances              209    

cpu.arff 資料集有209案例,每個案例由6個數值屬性預測1個數值屬性。

MYCT MMIN MMAX CACH CHMIN CHMAX class
125 256 6000 256 16 128 198
29 8000 32000 32 8 32 269
29 8000 32000 32 8 32 220
29 8000 32000 32 8 32 172
29 8000 16000 32 8 16 132
26 8000 32000 64 8 32 318
23 16000 32000 64 16 32 367
23 16000 32000 64 16 32 489
23 16000 64000 64 16 32 636
.....





參考: 1.weka.classifiers.functions.SimpleLinearRegression code | doc