// 哈工大分词接口稍微修改点 (HIT word-segmentation interface, slightly modified)

 package IRdll;

import java.io.File;
import java.io.Reader;
import java.io.FileInputStream;
import java.io.*;
import java.util.Date;
import java.util.HashSet;

/**
 * <p>Title: Java interface for Chinese word segmentation</p>
 * <p>Description: this component is built on top of the Harbin Institute of
 * Technology (HIT) word segmentation system.
 * It is provided for study and research purposes only; any commercial use is
 * at the user's own legal risk and the author accepts no responsibility.</p>
 * <p>Copyright: Copyright (c) 2006</p>
 * <p>Company: Dalian University of Technology</p>
 *
 * <p>The actual segmentation is delegated to native methods implemented in the
 * {@code IRdll} library, which is loaded by this class's static initializer.
 * The singleton is created lazily without synchronization, so obtaining it
 * from several threads at once is not safe.</p>
 *
 * @author :yezheng
 * @version 1.0
 */

public class IRSplit {

  /** Lazily created singleton; reset to null by {@link #ReleaseSeggers()}. */
  private static IRSplit instance = null;

  /**
   * Private constructor: prints progress messages around the native
   * {@code LoadSegRes()} call (presumably the dictionary load, judging by the
   * console messages).
   */
  private IRSplit() {
    System.out.println("正在加载词典……");
    this.LoadSegRes();
    System.out.println("加载结束");
  }

  /**
   * Returns the shared instance, creating it (and thereby calling the native
   * loader) on first use.
   *
   * @return the singleton instance
   */
  public static IRSplit getInstance() {
    if (instance == null) {
      instance = new IRSplit();
    }
    return instance;
  }

  // Native methods implemented in the IRdll library.
  private native void LoadSegRes();

  private native void ReleaseSegger();

  private native String split(String sentence);

  /**
   * Segments a single sentence with one call to the native segmenter.
   *
   * @param sentence text to segment; may be null
   * @return the native segmenter's output, or "" for null or empty input
   */
  public String splitSentence(String sentence) {
    // The null test must come first; the reverse order throws on null.
    if (sentence == null || sentence.length() < 1) {
      return "";
    }
    return split(sentence);
  }

  /**
   * Segments a long text piece by piece. The text is cut after every
   * character whose Unicode general category is
   * {@link Character#OTHER_PUNCTUATION}; each piece (including its trailing
   * punctuation character) is passed to the native segmenter and the results
   * are concatenated in order.
   *
   * @param sentence text to segment; may be null
   * @return the concatenated segmenter output, or "" for null or empty input
   */
  public String splitLongSentence(String sentence) {
    if (sentence == null || sentence.length() < 1) {
      return "";
    }
    // Local buffer so that the method does not depend on shared mutable state.
    StringBuffer result = new StringBuffer(sentence.length() * 2);
    int start = 0;
    for (int i = 0; i < sentence.length(); i++) {
      if (Character.getType(sentence.charAt(i)) == Character.OTHER_PUNCTUATION) {
        int end = i + 1; // the piece includes the punctuation character itself
        result.append(split(sentence.substring(start, end)));
        start = end;
      }
    }
    // Trailing text that is not terminated by punctuation.
    if (start < sentence.length()) {
      result.append(split(sentence.substring(start)));
    }
    return result.toString();
  }

  /**
   * Drops the singleton reference and calls the native
   * {@code ReleaseSegger()}. A later {@link #getInstance()} creates a new
   * instance and runs the native loader again.
   */
  public void ReleaseSeggers() {
    instance = null;
    ReleaseSegger();
  }

  /**
   * Segments a text file line by line and writes the result to another file.
   * The whole input is segmented in memory before the output file is opened,
   * so a read failure leaves the output untouched. Lines are written with
   * CRLF terminators. Both files use the platform default charset.
   * Errors are reported on standard output rather than thrown.
   *
   * @param file    input file
   * @param outfile output file (created or overwritten)
   */
  public void splitFile(File file, File outfile) {
    try {
      String segmented = segmentFileContents(file);

      FileWriter writer = new FileWriter(outfile);
      try {
        writer.write(segmented);
      }
      finally {
        writer.close(); // release the handle even if the write fails
      }
    }
    // NOTE: both messages name the input file, even when the failure came
    // from opening or writing the output file.
    catch (FileNotFoundException ex) {
      System.out.println(file.toString() + "File not Found");
    }
    catch (IOException ex1) {
      System.out.println(file.toString() + "IO errors");
    }
  }

  /** Reads a file and returns its segmented contents, one CRLF-terminated line per input line. */
  private String segmentFileContents(File file) throws IOException {
    BufferedReader br =
        new BufferedReader(new InputStreamReader(new FileInputStream(file)));
    try {
      StringBuffer out = new StringBuffer(102400);
      String line;
      while ( (line = br.readLine()) != null) {
        if (line.length() != 0) {
          out.append(splitLongSentence(line));
        }
        out.append("\r\n"); // empty lines are preserved as empty lines
      }
      return out.toString();
    }
    finally {
      br.close(); // also closes the underlying FileInputStream
    }
  }

  /**
   * Path-based convenience overload of {@link #splitFile(File, File)}.
   * Does nothing when {@code source} is not an existing regular file.
   *
   * @param source      path of the input file
   * @param destination path of the output file
   */
  public void splitFile(String source, String destination) {
    File file = new File(source);
    File outfile = new File(destination);
    if (file.isFile()) {
      splitFile(file, outfile);
    }
  }

  /**
   * Segments all text available from a reader, line by line, using
   * {@link #splitSentence(String)} for each non-empty line, and returns a new
   * reader over the CRLF-terminated result.
   *
   * @param reader source of the text; it is read to the end but not closed
   * @return a reader over the segmented text, or the original (possibly
   *         partially consumed) reader if reading failed
   */
  public Reader splitFile(Reader reader) {
    BufferedReader br = new BufferedReader(reader);
    StringBuffer out = new StringBuffer();

    try {
      String line;
      while ( (line = br.readLine()) != null) {
        if (line.length() != 0) {
          out.append(splitSentence(line));
        }
        out.append("\r\n");
      }
      return new StringReader(out.toString());
    }
    catch (IOException ex) {
      // Report the failure instead of swallowing it; the return value keeps
      // the earlier fallback of handing back the caller's reader.
      System.out.println("IO errors while reading input: " + ex.getMessage());
      return reader;
    }
  }

  /**
   * Recursively segments every regular file under a directory, writing each
   * result to a file of the same name under the destination directory.
   * Missing destination sub-directories are created; the top-level
   * destination directory itself is not created here.
   *
   * @param sourceDir      directory containing the input files
   * @param destinationDir directory receiving the output files
   */
  public void splitFiles(String sourceDir, String destinationDir) {
    File[] files = new File(sourceDir).listFiles();
    if (files == null) {
      // listFiles() returns null when the path is not a directory or cannot be read.
      System.out.println(sourceDir + " is not a readable directory");
      return;
    }
    for (int i = 0; i < files.length; i++) {
      File entry = files[i];
      if (entry.isFile()) {
        splitFile(entry, new File(destinationDir + "/" + entry.getName()));
      }
      else if (entry.isDirectory()) {
        File tempdir = new File(destinationDir + "/" + entry.getName());
        if (!tempdir.exists() || !tempdir.isDirectory()) {
          tempdir.mkdir();
        }
        splitFiles(sourceDir + "/" + entry.getName(),
                   tempdir.getAbsolutePath());
      }
    }
  }

  // Load the native library that implements the native methods above.
  static {
    System.loadLibrary("IRdll");
  }

  /**
   * Small timing demo: segments one text and prints the start time, the end
   * time and the elapsed milliseconds.
   *
   * @param args optional; {@code args[0]} is the text to segment, otherwise a
   *             built-in sample sentence is used
   */
  public static void main(String[] args) {

    IRSplit split = IRSplit.getInstance();

    // The text to segment was previously an undeclared variable.
    String ss = args.length > 0 ? args[0] : "今天天气很好。我们去公园散步。";

    Date startdate = new Date();

    //split.splitFiles("clean", "out");
    split.splitLongSentence(ss);

    Date enddate = new Date();

    System.out.println(startdate);
    System.out.println(enddate);
    System.out.println(enddate.getTime() - startdate.getTime());
  }

}

// 没有评论: (no comments)