blog_20160720_1_6333785 (80 lines, Java)
package edu.zju.cst.krselee.example.word2vector;

/**
 * Created by KrseLee on 16/7/20.
 */

import org.fnlp.nlp.cn.tag.CWSTagger;
import org.fnlp.util.exception.LoadModelException;

import java.io.IOException;
import java.util.List;

import org.fnlp.ml.types.Dictionary;
import org.fnlp.nlp.corpus.StopWords;

public class FudanTokenizer {

    private CWSTagger tag;

    private StopWords stopWords;

    public FudanTokenizer() {
        String path = this.getClass().getClassLoader().getResource("").getPath();
        System.out.println(path);
        try {
            tag = new CWSTagger(path + "models/seg.m");
        } catch (LoadModelException e) {
            e.printStackTrace();
        }
        // Note: stopWords is never initialized here; construct it from the stop-word
        // list that ships with the FNLP models before calling filterStopWords(),
        // otherwise phraseDel() will throw a NullPointerException.
    }

    public String processSentence(String context) {
        return tag.tag(context);
    }

    public String processSentence(String sentence, boolean english) {
        if (english) {
            tag.setEnFilter(true);
        }
        return tag.tag(sentence);
    }

    public String processFile(String filename) {
        return tag.tagFile(filename);
    }

    /**
     * Set the segmentation dictionary.
     */
    public boolean setDictionary() {
        String dictPath = this.getClass().getClassLoader().getResource("models/dict.txt").getPath();

        Dictionary dict;
        try {
            dict = new Dictionary(dictPath);
        } catch (IOException e) {
            return false;
        }
        tag.setDictionary(dict);
        return true;
    }

    /**
     * Remove stop words.
     */
    public List<String> filterStopWords(String[] words) {
        try {
            return stopWords.phraseDel(words);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
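For reference, a minimal sketch of how this tokenizer might be driven; the demo class, the sample sentence, and the call order (load the dictionary first, then tag) are my assumptions, not part of the original post:

// Hypothetical usage sketch for FudanTokenizer (not from the original post).
public class TokenizerDemo {
    public static void main(String[] args) {
        FudanTokenizer tokenizer = new FudanTokenizer();
        tokenizer.setDictionary(); // optional: load the custom dictionary first
        String segmented = tokenizer.processSentence("乔峰是天龙八部的主角之一");
        System.out.println(segmented); // CWSTagger returns words separated by spaces
    }
}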
blog_20160720_2_9125448 (11 lines, html/xml)
<dependency>
    <groupId>org.fnlp</groupId>
    <artifactId>fnlp-core</artifactId>
    <version>2.1-SNAPSHOT</version>
</dependency>

<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.11</version>
</dependency>
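These two dependencies only cover FNLP and JUnit; the word2vec example further down also needs the Deeplearning4j artifacts on the classpath. A plausible addition, assuming the 0.x release line that was current at the time (the version number is my assumption, not from the post):

<dependency>
    <groupId>org.deeplearning4j</groupId>
    <artifactId>deeplearning4j-nlp</artifactId>
    <!-- version is an assumption; use the release matching your setup -->
    <version>0.5.0</version>
</dependency>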
blog_20160720_3_5363457 (22 lines, Java)
public void processFile() throws Exception {
    String filePath = this.getClass().getClassLoader().getResource("text/tlbb.txt").getPath();
    BufferedReader in = new BufferedReader(new FileReader(filePath));

    File outfile = new File("/Users/KrseLee/dataset/tlbb_t.txt");
    if (outfile.exists()) {
        outfile.delete();
    }
    // Construct the FileOutputStream; the file is created automatically if it does not exist
    FileOutputStream fop = new FileOutputStream(outfile);

    String line = in.readLine();
    OutputStreamWriter writer = new OutputStreamWriter(fop, "UTF-8");
    while (line != null) {
        line = tokenizer.processSentence(line);
        writer.append(line).append('\n'); // keep one sentence per line for the line iterator used later
        line = in.readLine();
    }
    in.close();
    writer.close(); // closing the writer also flushes buffered content to the file
    fop.close();    // close the output stream and release system resources
}
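processFile() references a tokenizer field, so it presumably sits inside a test class that holds a FudanTokenizer; the post does not show that class, so the harness below is a guess (JUnit 4, matching the dependency above):

import org.junit.Before;
import org.junit.Test;

// Assumed surrounding class for processFile(); only the method body is from the post.
public class FudanTokenizerTest {
    private FudanTokenizer tokenizer;

    @Before
    public void setUp() {
        tokenizer = new FudanTokenizer(); // provides the `tokenizer` field used by processFile()
    }

    @Test
    public void processFile() throws Exception {
        // body as shown above
    }
}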
blog_20160720_4_3204649 (55 lines, Java)
package edu.zju.cst.krselee.example.word2vector;

import org.canova.api.util.ClassPathResource;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;

/**
 * Created by KrseLee on 16/7/20.
 */
public class ZhWord2Vector {
    private static Logger log = LoggerFactory.getLogger(ZhWord2Vector.class);

    public static void main(String[] args) throws Exception {

        String filePath = new ClassPathResource("text/tlbb_t.txt").getFile().getAbsolutePath();

        log.info("Load & Vectorize Sentences....");
        // Strip white space before and after for each line
        SentenceIterator iter = new BasicLineIterator(filePath);
        // The corpus is already segmented, so the default whitespace
        // split is enough to get words

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5)
                .iterations(1)
                .layerSize(100)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");

        // Write word vectors
        WordVectorSerializer.writeWordVectors(vec, "tlbb_vectors.txt");
        WordVectorSerializer.writeFullModel(vec, "tlbb_model.txt");
        String[] names = {"萧峰", "乔峰", "段誉", "虚竹", "王语嫣", "阿紫", "阿朱", "木婉清"};
        log.info("Closest Words:");

        for (String name : names) {
            System.out.println(name + ">>>>>>");
            Collection<String> lst = vec.wordsNearest(name, 10);
            System.out.println(lst);
        }
    }
}
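Since the full model is written out, it can be reloaded later instead of being retrained; a sketch, assuming WordVectorSerializer.loadFullModel is available in the Deeplearning4j version used (the demo class is mine; the file name matches the one written above):

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;

// Hypothetical reload sketch (not from the original post).
public class LoadModelDemo {
    public static void main(String[] args) throws Exception {
        // Restore the model written by ZhWord2Vector: weights and vocab
        // come back, so similarity queries work without retraining.
        Word2Vec vec = WordVectorSerializer.loadFullModel("tlbb_model.txt");
        System.out.println(vec.similarity("萧峰", "乔峰")); // cosine similarity of the two names
        System.out.println(vec.wordsNearest("段誉", 5));
    }
}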
blog_20160720_5_5298200 (16 lines, Text)
萧峰>>>>>>
[段誉, 叫骂, 一队队, 军官, 将, 狗子, 长矛, 指挥, 说, 传令]
乔峰>>>>>>
[南, 大侠, 北, 大英雄, 四海, 厮, 听说, 奸谋, 威震, 全舵]
段誉>>>>>>
[萧峰, 虚竹, 向, 玄渡, 等, 叫骂, 去, 辽兵, 一边, 城门]
虚竹>>>>>>
[段誉, 向西, 萧峰, 向, 城门, 叫骂, 等, 辽兵, 玄鸣, 去]
王语嫣>>>>>>
[巴天石, 钟灵, 木婉清, 草海, 朱丹臣, 老婆婆, 瘴气, 贾老者, 嗒嗒嗒, 途中]
阿紫>>>>>>
[道, 穆贵妃, 抿嘴笑, 姊夫, 来, 叫, 又, 小嘴, 大人, 什么]
阿朱>>>>>>
[深情, 想起, 换上, 父母, 想念, 恩情, 胡作非为, 迫, 情意, 永远]
木婉清>>>>>>
[钟灵, 朱丹臣, 巴天石, 秦红棉, 范骅, 一行人, 王语嫣, 墙外, 阮星竹, 巴天]