1. Add the IK analyzer to auto-complete for a better experience

main
Gary 11 months ago
parent b1d7a2963d
commit c0a7d5961b

@ -74,6 +74,16 @@
<artifactId>common-random</artifactId>
<version>1.0.21</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.10.3</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.10.3</version>
</dependency>
</dependencies>
<build>
<finalName>app</finalName>

@ -35,4 +35,7 @@ public interface SongInfoDao extends JpaRepository<SongInfo,String>, JpaSpecific
/**
* Runs a native query against tb_song_info selecting the row whose id column
* equals the given value.
*
* @param id value bound to the first positional parameter of the query
* @return the matching song row
*/
@Query(value = "select * from tb_song_info where id=?1", nativeQuery = true)
SongInfo getSongById(String id);
/**
* Runs a native query against tb_song_info selecting every row whose state
* column equals '1'.
*
* @return all rows with state '1'
*/
@Query(value = "select * from tb_song_info where state='1' ", nativeQuery = true)
List<SongInfo> findValidSongs();
}

@ -0,0 +1,79 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.cfg;
import java.util.List;
/**
 * Settings read by the IK segmenter: the useSmart switch and the locations
 * of the dictionaries to load.
 */
public interface Configuration {

    /**
     * Reports the current value of the useSmart switch.
     *
     * @return the useSmart flag
     */
    boolean useSmart();

    /**
     * Changes the useSmart switch.
     *
     * @param useSmart the new flag value
     */
    void setUseSmart(boolean useSmart);

    /**
     * Gives the location of the main dictionary.
     *
     * @return path of the main dictionary
     */
    String getMainDictionary();

    /**
     * Gives the location of the quantifier dictionary.
     *
     * @return path of the quantifier dictionary
     */
    String getQuantifierDicionary();

    /**
     * Lists the configured extension dictionaries.
     *
     * @return paths of the extension dictionaries
     */
    List<String> getExtDictionarys();

    /**
     * Lists the configured extension stop-word dictionaries.
     *
     * @return paths of the extension stop-word dictionaries
     */
    List<String> getExtStopWordDictionarys();
}

@ -0,0 +1,169 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package com.luoo.music.ik.cfg;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;
/**
* Configuration
* 2012-5-8
*
*/
public class DefaultConfig implements Configuration{
/*
*
*/
private static final String PATH_DIC_MAIN = "com/luoo/music/ik/dic/main2012.dic";
private static final String PATH_DIC_QUANTIFIER = "com/luoo/music/ik/dic/quantifier.dic";
/*
*
*/
private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
//配置属性——扩展字典
private static final String EXT_DICT = "ext_dict";
//配置属性——扩展停止词典
private static final String EXT_STOP = "ext_stopwords";
private Properties props;
/*
* 使smart
*/
private boolean useSmart;
/**
*
* @return Configuration
*/
public static Configuration getInstance(){
return new DefaultConfig();
}
/*
*
*/
private DefaultConfig(){
props = new Properties();
InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME);
if(input != null){
try {
props.loadFromXML(input);
} catch (InvalidPropertiesFormatException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* useSmart
* useSmart =true 使 =false使
* @return useSmart
*/
public boolean useSmart() {
return useSmart;
}
/**
* useSmart
* useSmart =true 使 =false使
* @param useSmart
*/
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
*
*
* @return String
*/
public String getMainDictionary(){
return PATH_DIC_MAIN;
}
/**
*
* @return String
*/
public String getQuantifierDicionary(){
return PATH_DIC_QUANTIFIER;
}
/**
*
* @return List<String>
*/
public List<String> getExtDictionarys(){
List<String> extDictFiles = new ArrayList<String>(2);
String extDictCfg = props.getProperty(EXT_DICT);
if(extDictCfg != null){
//使用;分割多个扩展字典配置
String[] filePaths = extDictCfg.split(";");
if(filePaths != null){
for(String filePath : filePaths){
if(filePath != null && !"".equals(filePath.trim())){
extDictFiles.add(filePath.trim());
}
}
}
}
return extDictFiles;
}
/**
*
* @return List<String>
*/
public List<String> getExtStopWordDictionarys(){
List<String> extStopWordDictFiles = new ArrayList<String>(2);
String extStopWordDictCfg = props.getProperty(EXT_STOP);
if(extStopWordDictCfg != null){
//使用;分割多个扩展字典配置
String[] filePaths = extStopWordDictCfg.split(";");
if(filePaths != null){
for(String filePath : filePaths){
if(filePath != null && !"".equals(filePath.trim())){
extStopWordDictFiles.add(filePath.trim());
}
}
}
}
return extStopWordDictFiles;
}
}

@ -0,0 +1,391 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import com.luoo.music.ik.cfg.Configuration;
import com.luoo.music.ik.dic.Dictionary;
/**
 * Mutable working state of one segmenter: the character buffer filled from
 * the reader, the cursor into it, the raw lexemes reported by the
 * sub-segmenters and the queue of final results.
 */
class AnalyzeContext {

    // Size of the character buffer
    private static final int BUFF_SIZE = 4096;
    // Width of the zone at the end of a full buffer in which a refill may be requested
    private static final int BUFF_EXHAUST_CRITICAL = 100;

    // Characters read from the reader
    private char[] segmentBuff;
    // Character type of each buffered character, filled in as the cursor advances
    private int[] charTypes;
    // Accumulated cursor positions of earlier buffers, i.e. the displacement
    // of the current segmentBuff relative to the start of the reader
    private int buffOffset;
    // Current position inside segmentBuff
    private int cursor;
    // Number of usable characters recorded by the latest fillBuffer call
    private int available;
    // Names of the sub-segmenters that currently hold the buffer;
    // while this set is non-empty no refill is requested
    private Set<String> buffLocker;
    // Raw lexemes, before ambiguity handling
    private QuickSortSet orgLexemes;
    // Lexeme paths indexed by the position at which they begin
    private Map<Integer , LexemePath> pathMap;
    // Final results waiting to be handed out
    private LinkedList<Lexeme> results;
    // Segmenter configuration
    private Configuration cfg;

    public AnalyzeContext(Configuration cfg){
        this.cfg = cfg;
        this.segmentBuff = new char[BUFF_SIZE];
        this.charTypes = new int[BUFF_SIZE];
        this.buffLocker = new HashSet<String>();
        this.orgLexemes = new QuickSortSet();
        this.pathMap = new HashMap<Integer , LexemePath>();
        this.results = new LinkedList<Lexeme>();
    }

    int getCursor(){
        return this.cursor;
    }

    char[] getSegmentBuff(){
        return this.segmentBuff;
    }

    char getCurrentChar(){
        return this.segmentBuff[this.cursor];
    }

    int getCurrentCharType(){
        return this.charTypes[this.cursor];
    }

    int getBufferOffset(){
        return this.buffOffset;
    }

    /**
     * Fills segmentBuff from the reader and rewinds the cursor.
     * While buffOffset is still 0 the buffer is read from scratch; afterwards
     * the characters from the cursor onwards are moved to the head of the
     * buffer and the remainder is read behind them.
     *
     * @param reader source of characters
     * @return the count stored in {@code available}; the value returned by
     *         {@code Reader.read} is added as-is, including its end-of-stream value
     * @throws IOException if reading fails
     */
    int fillBuffer(Reader reader) throws IOException{
        int filled;
        if(this.buffOffset == 0){
            // nothing accounted for yet: read straight into the buffer
            filled = reader.read(this.segmentBuff);
        }else{
            filled = 0;
            int leftover = this.available - this.cursor;
            if(leftover > 0){
                // keep the part from the cursor onwards by moving it to the head
                System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , leftover);
                filled = leftover;
            }
            // top up the rest of the buffer behind the carried-over characters
            filled += reader.read(this.segmentBuff , leftover , BUFF_SIZE - leftover);
        }
        this.available = filled;
        this.cursor = 0;
        return filled;
    }

    /**
     * Moves the cursor to the head of the buffer and regularizes and
     * classifies the character found there.
     */
    void initCursor(){
        this.cursor = 0;
        this.prepareCharAtCursor();
    }

    /**
     * Advances the cursor by one and prepares the character it lands on.
     *
     * @return true if the cursor moved, false if it already was on the last
     *         available character
     */
    boolean moveCursor(){
        if(this.cursor >= this.available - 1){
            return false;
        }
        this.cursor++;
        this.prepareCharAtCursor();
        return true;
    }

    /* Regularizes the character under the cursor in place and records its type. */
    private void prepareCharAtCursor(){
        char regular = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
        this.segmentBuff[this.cursor] = regular;
        this.charTypes[this.cursor] = CharacterUtil.identifyCharType(regular);
    }

    /**
     * Registers a sub-segmenter as holder of the buffer.
     *
     * @param segmenterName name of the sub-segmenter
     */
    void lockBuffer(String segmenterName){
        this.buffLocker.add(segmenterName);
    }

    /**
     * Removes a sub-segmenter from the holders of the buffer.
     *
     * @param segmenterName name of the sub-segmenter
     */
    void unlockBuffer(String segmenterName){
        this.buffLocker.remove(segmenterName);
    }

    /**
     * Tells whether any sub-segmenter still holds the buffer.
     *
     * @return true while at least one name is registered
     */
    boolean isBufferLocked(){
        return !this.buffLocker.isEmpty();
    }

    /**
     * Tells whether the cursor sits on the last available character.
     *
     * @return true when cursor == available - 1
     */
    boolean isBufferConsumed(){
        return this.cursor == this.available - 1;
    }

    /**
     * Decides whether processing of the current buffer should stop early so
     * that it can be refilled. All of the following must hold:
     * the buffer was filled completely, the cursor is before the last
     * character but within the last BUFF_EXHAUST_CRITICAL positions, and no
     * sub-segmenter holds the buffer.
     *
     * @return true if a refill should happen now
     */
    boolean needRefillBuffer(){
        if(this.available != BUFF_SIZE){
            return false;
        }
        boolean inTailZone = this.cursor < this.available - 1
                && this.cursor > this.available - BUFF_EXHAUST_CRITICAL;
        return inTailZone && !this.isBufferLocked();
    }

    /**
     * Adds the current cursor position to the accumulated buffer offset.
     */
    void markBufferOffset(){
        this.buffOffset += this.cursor;
    }

    /**
     * Records a raw lexeme.
     *
     * @param lexeme lexeme reported by a sub-segmenter
     */
    void addLexeme(Lexeme lexeme){
        this.orgLexemes.addLexeme(lexeme);
    }

    /**
     * Indexes a lexeme path under the position at which it begins.
     *
     * @param path path to store; null is ignored
     */
    void addLexemePath(LexemePath path){
        if(path == null){
            return;
        }
        this.pathMap.put(path.getPathBegin(), path);
    }

    /**
     * Gives access to the raw lexemes.
     *
     * @return the raw lexeme set
     */
    QuickSortSet getOrgLexemes(){
        return this.orgLexemes;
    }

    /**
     * Moves the lexemes of the stored paths into the result queue, walking
     * the buffer from position 0 up to the cursor. Positions of type
     * CHAR_USELESS are skipped; positions not covered by any lexeme are
     * emitted as single-character lexemes when they are Chinese or other CJK
     * characters. The path index is emptied afterwards.
     */
    void outputToResult(){
        int index = 0;
        while(index <= this.cursor){
            // skip characters that are not segmented at all
            if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
                index++;
                continue;
            }
            LexemePath path = this.pathMap.get(index);
            if(path == null){
                // no path begins here: emit the character on its own
                this.outputSingleCJK(index);
                index++;
                continue;
            }
            Lexeme l = path.pollFirst();
            while(l != null){
                this.results.add(l);
                // continue right behind the lexeme just emitted
                index = l.getBegin() + l.getLength();
                l = path.pollFirst();
                if(l != null){
                    // emit the single characters left between two lexemes of the path
                    while(index < l.getBegin()){
                        this.outputSingleCJK(index);
                        index++;
                    }
                }
            }
        }
        this.pathMap.clear();
    }

    /**
     * Queues a one-character lexeme for the given position if the character
     * there is Chinese or another CJK character; other types are ignored.
     *
     * @param index position inside the buffer
     */
    private void outputSingleCJK(int index){
        int charType = this.charTypes[index];
        if(CharacterUtil.CHAR_CHINESE == charType){
            this.results.add(new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR));
        }else if(CharacterUtil.CHAR_OTHER_CJK == charType){
            this.results.add(new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK));
        }
    }

    /**
     * Takes lexemes from the result queue until one is found that is not a
     * stop word. Each candidate first goes through numeral/quantifier
     * merging; the lexeme returned has its text filled in from the buffer.
     *
     * @return the next lexeme, or null when the queue is exhausted
     */
    Lexeme getNextLexeme(){
        for(Lexeme candidate = this.results.pollFirst(); candidate != null; candidate = this.results.pollFirst()){
            // merge numerals with following numerals/quantifiers
            this.compound(candidate);
            boolean stopWord = Dictionary.getSingleton().isStopWord(this.segmentBuff , candidate.getBegin() , candidate.getLength());
            if(!stopWord){
                candidate.setLexemeText(String.valueOf(segmentBuff , candidate.getBegin() , candidate.getLength()));
                return candidate;
            }
        }
        return null;
    }

    /**
     * Returns the context to its initial state with fresh buffers.
     */
    void reset(){
        this.buffLocker.clear();
        this.pathMap.clear();
        this.results.clear();
        this.orgLexemes = new QuickSortSet();
        this.segmentBuff = new char[BUFF_SIZE];
        this.charTypes = new int[BUFF_SIZE];
        this.available = 0;
        this.buffOffset = 0;
        this.cursor = 0;
    }

    /**
     * When useSmart is on, merges an Arabic numeral with a following Chinese
     * numeral or quantifier, and a Chinese numeral with a following
     * quantifier, removing each merged lexeme from the queue.
     */
    private void compound(Lexeme result){
        if(!this.cfg.useSmart()){
            return;
        }
        if(this.results.isEmpty()){
            return;
        }
        if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
            Lexeme next = this.results.peekFirst();
            boolean merged = false;
            if(Lexeme.TYPE_CNUM == next.getLexemeType()){
                // Arabic numeral + Chinese numeral
                merged = result.append(next, Lexeme.TYPE_CNUM);
            }else if(Lexeme.TYPE_COUNT == next.getLexemeType()){
                // Arabic numeral + Chinese quantifier
                merged = result.append(next, Lexeme.TYPE_CQUAN);
            }
            if(merged){
                this.results.pollFirst();
            }
        }
        // a second round is possible once the lexeme has become a Chinese numeral
        if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
            Lexeme next = this.results.peekFirst();
            // Chinese numeral + Chinese quantifier
            if(Lexeme.TYPE_COUNT == next.getLexemeType() && result.append(next, Lexeme.TYPE_CQUAN)){
                this.results.pollFirst();
            }
        }
    }
}

@ -0,0 +1,126 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
import java.util.LinkedList;
import java.util.List;
import com.luoo.music.ik.dic.Dictionary;
import com.luoo.music.ik.dic.Hit;
/**
 * Sub-segmenter that looks up words of the main dictionary, starting a match
 * at every character whose type is not CHAR_USELESS.
 */
class CJKSegmenter implements ISegmenter {

    // Name under which this sub-segmenter locks the buffer
    static final String SEGMENTER_NAME = "CJK_SEGMENTER";

    // Partial dictionary matches that may still grow into a word
    private List<Hit> tmpHits;

    CJKSegmenter(){
        this.tmpHits = new LinkedList<Hit>();
    }

    /**
     * Processes the character under the cursor and then locks or unlocks the
     * buffer depending on whether partial matches remain.
     *
     * @see ISegmenter#analyze(AnalyzeContext)
     */
    public void analyze(AnalyzeContext context) {
        if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()){
            // a useless character ends every pending match
            this.tmpHits.clear();
        }else{
            // pending matches first, then a fresh match at the cursor
            this.extendPendingHits(context);
            this.matchAtCursor(context);
        }

        // nothing can continue past the end of the buffer
        if(context.isBufferConsumed()){
            this.tmpHits.clear();
        }

        // hold the buffer only while partial matches are pending
        if(this.tmpHits.isEmpty()){
            context.unlockBuffer(SEGMENTER_NAME);
        }else{
            context.lockBuffer(SEGMENTER_NAME);
        }
    }

    /* Extends each pending hit by the current character, emitting words and dropping dead ends. */
    private void extendPendingHits(AnalyzeContext context){
        if(this.tmpHits.isEmpty()){
            return;
        }
        // iterate over a snapshot because hits are removed from the list on the way
        Hit[] pending = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
        for(Hit candidate : pending){
            Hit hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , candidate);
            if(hit.isMatch()){
                // a complete word ends at the cursor
                context.addLexeme(new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD));
                if(!hit.isPrefix()){
                    // cannot grow any further
                    this.tmpHits.remove(hit);
                }
            }else if(hit.isUnmatch()){
                this.tmpHits.remove(hit);
            }
        }
    }

    /* Tries the single character under the cursor as a word and as the start of a longer word. */
    private void matchAtCursor(AnalyzeContext context){
        Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
        if(singleCharHit.isMatch()){
            // the character is a word by itself
            context.addLexeme(new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD));
        }
        if(singleCharHit.isPrefix()){
            // the character starts a longer word: keep the hit for the next characters
            this.tmpHits.add(singleCharHit);
        }
    }

    /**
     * Drops every pending partial match.
     *
     * @see ISegmenter#reset()
     */
    public void reset() {
        this.tmpHits.clear();
    }
}

@ -0,0 +1,242 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.luoo.music.ik.dic.Dictionary;
import com.luoo.music.ik.dic.Hit;
/**
 * Sub-segmenter for Chinese numerals and for the quantifiers that follow a
 * numeral.
 */
class CN_QuantifierSegmenter implements ISegmenter{

    // Name under which this sub-segmenter locks the buffer
    static final String SEGMENTER_NAME = "QUAN_SEGMENTER";

    // Characters treated as Chinese numerals
    private static final String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum
    private static final Set<Character> ChnNumberChars = new HashSet<Character>();
    static{
        for(char numeral : Chn_Num.toCharArray()){
            ChnNumberChars.add(numeral);
        }
    }

    /*
     * Buffer position at which the numeral being collected starts;
     * -1 while no numeral is being collected.
     */
    private int nStart;
    /*
     * Buffer position of the last character of the numeral being collected;
     * -1 while no numeral is being collected.
     */
    private int nEnd;

    // Partial quantifier matches that may still grow into a quantifier
    private List<Hit> countHits;

    CN_QuantifierSegmenter(){
        nStart = -1;
        nEnd = -1;
        this.countHits = new LinkedList<Hit>();
    }

    /**
     * Processes the character under the cursor as numeral and as quantifier,
     * then locks the buffer while a numeral or quantifier is still open.
     */
    public void analyze(AnalyzeContext context) {
        this.processCNumber(context);
        this.processCount(context);

        boolean idle = this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty();
        if(idle){
            context.unlockBuffer(SEGMENTER_NAME);
        }else{
            context.lockBuffer(SEGMENTER_NAME);
        }
    }

    /**
     * Forgets any numeral or quantifier in progress.
     */
    public void reset() {
        nStart = -1;
        nEnd = -1;
        countHits.clear();
    }

    /**
     * Collects consecutive Chinese numeral characters and emits them as one
     * numeral lexeme when the run ends or the buffer is consumed.
     */
    private void processCNumber(AnalyzeContext context){
        boolean numeralChar = CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
                && ChnNumberChars.contains(context.getCurrentChar());

        if(nStart == -1 && nEnd == -1){
            // idle: a numeral character opens a new run
            if(numeralChar){
                nStart = context.getCursor();
                nEnd = context.getCursor();
            }
        }else if(numeralChar){
            // run continues
            nEnd = context.getCursor();
        }else{
            // run ended on the previous character
            this.outputNumLexeme(context);
            nStart = -1;
            nEnd = -1;
        }

        // end of buffer with a run still open: flush it
        if(context.isBufferConsumed() && nStart != -1 && nEnd != -1){
            outputNumLexeme(context);
            nStart = -1;
            nEnd = -1;
        }
    }

    /**
     * Matches quantifiers from the quantifier dictionary, but only while a
     * numeral is involved (see {@link #needCountScan(AnalyzeContext)}).
     *
     * @param context analysis context
     */
    private void processCount(AnalyzeContext context){
        if(!this.needCountScan(context)){
            return;
        }

        if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
            // pending matches first, then a fresh match at the cursor
            this.extendPendingHits(context);
            this.matchAtCursor(context);
        }else{
            // not a Chinese character: drop unfinished quantifiers
            this.countHits.clear();
        }

        // nothing can continue past the end of the buffer
        if(context.isBufferConsumed()){
            this.countHits.clear();
        }
    }

    /* Extends each pending quantifier hit by the current character. */
    private void extendPendingHits(AnalyzeContext context){
        if(this.countHits.isEmpty()){
            return;
        }
        // iterate over a snapshot because hits are removed from the list on the way
        Hit[] pending = this.countHits.toArray(new Hit[this.countHits.size()]);
        for(Hit candidate : pending){
            Hit hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , candidate);
            if(hit.isMatch()){
                // a complete quantifier ends at the cursor
                context.addLexeme(new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT));
                if(!hit.isPrefix()){
                    // cannot grow any further
                    this.countHits.remove(hit);
                }
            }else if(hit.isUnmatch()){
                this.countHits.remove(hit);
            }
        }
    }

    /* Tries the single character under the cursor as a quantifier and as the start of a longer one. */
    private void matchAtCursor(AnalyzeContext context){
        Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
        if(singleCharHit.isMatch()){
            // the character is a quantifier by itself
            context.addLexeme(new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT));
        }
        if(singleCharHit.isPrefix()){
            // the character starts a longer quantifier
            this.countHits.add(singleCharHit);
        }
    }

    /**
     * Decides whether quantifier matching should run for the current
     * character: either a numeral or quantifier is in progress, or the last
     * raw lexeme is a Chinese or Arabic numeral ending right before the cursor.
     *
     * @return true if quantifier matching should run
     */
    private boolean needCountScan(AnalyzeContext context){
        if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
            return true;
        }
        if(context.getOrgLexemes().isEmpty()){
            return false;
        }
        Lexeme l = context.getOrgLexemes().peekLast();
        boolean numeral = Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType();
        return numeral && l.getBegin() + l.getLength() == context.getCursor();
    }

    /**
     * Emits the numeral collected so far, if there is one.
     *
     * @param context analysis context
     */
    private void outputNumLexeme(AnalyzeContext context){
        if(nStart > -1 && nEnd > -1){
            context.addLexeme(new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM));
        }
    }
}

@ -0,0 +1,102 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package com.luoo.music.ik.core;
/**
 * Character helpers: classification into the CHAR_* types and regularization
 * of full-width characters.
 */
class CharacterUtil {

    public static final int CHAR_USELESS = 0;
    public static final int CHAR_ARABIC = 0x1;
    public static final int CHAR_ENGLISH = 0x2;
    public static final int CHAR_CHINESE = 0x4;
    public static final int CHAR_OTHER_CJK = 0x8;

    // U+3000, the ideographic (full-width) space
    private static final int FULL_WIDTH_SPACE = 0x3000;
    // Exclusive bounds of the full-width range that is folded to ASCII
    private static final int FULL_WIDTH_LOW = 0xFF00;
    private static final int FULL_WIDTH_HIGH = 0xFF5F;
    // Distance between a full-width character and its ASCII counterpart
    private static final int FULL_WIDTH_SHIFT = 0xFEE0;

    /**
     * Classifies a character.
     *
     * @param input character to classify
     * @return one of the CHAR_* constants of this class
     */
    static int identifyCharType(char input){
        if(input >= '0' && input <= '9'){
            return CHAR_ARABIC;
        }
        boolean lower = input >= 'a' && input <= 'z';
        boolean upper = input >= 'A' && input <= 'Z';
        if(lower || upper){
            return CHAR_ENGLISH;
        }
        Character.UnicodeBlock block = Character.UnicodeBlock.of(input);
        if(isChinese(block)){
            return CHAR_CHINESE;
        }
        if(isOtherCjk(block)){
            return CHAR_OTHER_CJK;
        }
        // everything else is not processed
        return CHAR_USELESS;
    }

    /* Unicode blocks counted as Chinese. */
    private static boolean isChinese(Character.UnicodeBlock block){
        return block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
    }

    /* Full-width forms plus the Korean and Japanese blocks. */
    private static boolean isOtherCjk(Character.UnicodeBlock block){
        return block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                // Korean
                || block == Character.UnicodeBlock.HANGUL_SYLLABLES
                || block == Character.UnicodeBlock.HANGUL_JAMO
                || block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                // Japanese
                || block == Character.UnicodeBlock.HIRAGANA
                || block == Character.UnicodeBlock.KATAKANA
                || block == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS;
    }

    /**
     * Regularizes a character: the full-width space becomes an ASCII space
     * and the full-width forms between U+FF00 and U+FF5F (both exclusive)
     * become their ASCII counterparts. Letter case is left untouched.
     *
     * @param input character to regularize
     * @return the regularized character
     */
    static char regularize(char input){
        if(input == FULL_WIDTH_SPACE){
            return (char) 32;
        }
        if(input > FULL_WIDTH_LOW && input < FULL_WIDTH_HIGH){
            return (char) (input - FULL_WIDTH_SHIFT);
        }
        return input;
    }
}

@ -0,0 +1,153 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
import java.util.Stack;
import java.util.TreeSet;
/**
 * Groups the raw lexemes of a context into paths of crossing lexemes and,
 * when requested, resolves each ambiguous group to a single path.
 */
class IKArbitrator {

    IKArbitrator(){
    }

    /**
     * Drains the raw lexemes of the context, groups crossing lexemes into
     * paths and stores one path per group in the context.
     *
     * @param context analysis context holding the raw lexemes
     * @param useSmart whether groups of more than one lexeme are resolved
     *        through {@link #judge}; otherwise they are stored unchanged
     */
    void process(AnalyzeContext context , boolean useSmart){
        QuickSortSet orgLexemes = context.getOrgLexemes();
        Lexeme orgLexeme = orgLexemes.pollFirst();
        LexemePath crossPath = new LexemePath();
        while(orgLexeme != null){
            boolean joined = crossPath.addCrossLexeme(orgLexeme);
            if(!joined){
                // the lexeme does not cross the current group: close the
                // group and open a new one starting with this lexeme
                this.emit(context , crossPath , useSmart);
                crossPath = new LexemePath();
                crossPath.addCrossLexeme(orgLexeme);
            }
            orgLexeme = orgLexemes.pollFirst();
        }
        // the group still open at the end
        this.emit(context , crossPath , useSmart);
    }

    /* Stores a finished group, resolving it first when it is ambiguous and useSmart is on. */
    private void emit(AnalyzeContext context , LexemePath crossPath , boolean useSmart){
        if(crossPath.size() == 1 || !useSmart){
            // unambiguous, or ambiguity handling is off: store as is
            context.addLexemePath(crossPath);
            return;
        }
        QuickSortSet.Cell headCell = crossPath.getHead();
        LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
        context.addLexemePath(judgeResult);
    }

    /**
     * Builds candidate paths of non-crossing lexemes from a group and picks
     * the first one in the ordering of the candidate set.
     *
     * @param lexemeCell head cell of the group
     * @param fullTextLength length of the group (not used by this method)
     * @return the candidate that sorts first
     */
    private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
        // candidate paths, ordered by LexemePath's own ordering
        TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
        // path that is modified in place while candidates are generated
        LexemePath option = new LexemePath();

        // first pass over the group; lexemes that did not fit are returned
        Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell , option);
        pathOptions.add(option.copy());

        // retry from every conflicting lexeme to generate alternatives
        while(!lexemeStack.isEmpty()){
            QuickSortSet.Cell conflict = lexemeStack.pop();
            // roll the path back until the conflicting lexeme fits
            this.backPath(conflict.getLexeme() , option);
            this.forwardPath(conflict , option);
            pathOptions.add(option.copy());
        }

        return pathOptions.first();
    }

    /**
     * Walks the cell chain from the given cell and appends every lexeme that
     * does not cross the path built so far.
     *
     * @param lexemeCell cell to start from
     * @param option path being extended
     * @return the cells whose lexemes could not be appended
     */
    private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
        Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>();
        for(QuickSortSet.Cell cell = lexemeCell; cell != null && cell.getLexeme() != null; cell = cell.getNext()){
            boolean appended = option.addNotCrossLexeme(cell.getLexeme());
            if(!appended){
                conflictStack.push(cell);
            }
        }
        return conflictStack;
    }

    /**
     * Removes lexemes from the tail of the path until the given lexeme no
     * longer crosses it.
     *
     * @param l lexeme that has to fit
     * @param option path being rolled back
     */
    private void backPath(Lexeme l , LexemePath option){
        while(option.checkCross(l)){
            option.removeTail();
        }
    }
}

@ -0,0 +1,168 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*/
package com.luoo.music.ik.core;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import com.luoo.music.ik.cfg.Configuration;
import com.luoo.music.ik.cfg.DefaultConfig;
import com.luoo.music.ik.dic.Dictionary;
/**
 * Entry point of the IK segmenter: reads text from a Reader and hands out
 * lexemes one at a time through {@link #next()}.
 */
public final class IKSegmenter {

    // Source of the text
    private Reader input;
    // Segmenter configuration
    private Configuration cfg;
    // Working state shared with the sub-segmenters
    private AnalyzeContext context;
    // Sub-segmenters, run in list order on every character
    private List<ISegmenter> segmenters;
    // Resolves crossing lexemes
    private IKArbitrator arbitrator;

    /**
     * Creates a segmenter with a fresh default configuration.
     *
     * @param input source of the text
     * @param useSmart value stored in the configuration's useSmart switch
     */
    public IKSegmenter(Reader input , boolean useSmart){
        this.input = input;
        this.cfg = DefaultConfig.getInstance();
        this.cfg.setUseSmart(useSmart);
        this.init();
    }

    /**
     * Creates a segmenter with a caller-supplied configuration.
     *
     * @param input source of the text
     * @param cfg configuration to use
     */
    public IKSegmenter(Reader input , Configuration cfg){
        this.input = input;
        this.cfg = cfg;
        this.init();
    }

    /**
     * Initialises the dictionary and creates the context, the
     * sub-segmenters and the arbitrator.
     */
    private void init(){
        Dictionary.initial(this.cfg);
        this.context = new AnalyzeContext(this.cfg);
        this.segmenters = this.loadSegmenters();
        this.arbitrator = new IKArbitrator();
    }

    /**
     * Creates the sub-segmenters in the order in which they are run.
     *
     * @return the list of sub-segmenters
     */
    private List<ISegmenter> loadSegmenters(){
        List<ISegmenter> loaded = new ArrayList<ISegmenter>(4);
        // letters
        loaded.add(new LetterSegmenter());
        // Chinese numerals and quantifiers
        loaded.add(new CN_QuantifierSegmenter());
        // dictionary words
        loaded.add(new CJKSegmenter());
        return loaded;
    }

    /**
     * Returns the next lexeme, refilling and analysing the buffer whenever
     * the result queue runs empty.
     *
     * @return the next lexeme, or null once the reader delivers no more characters
     * @throws IOException if reading from the input fails
     */
    public synchronized Lexeme next()throws IOException{
        Lexeme lexeme = context.getNextLexeme();
        while(lexeme == null){
            int available = context.fillBuffer(this.input);
            if(available <= 0){
                // input exhausted
                context.reset();
                return null;
            }
            this.analyzeBuffer();
            // resolve crossing lexemes
            this.arbitrator.process(context, this.cfg.useSmart());
            // queue the results, including single CJK characters no lexeme covers
            context.outputToResult();
            // account for the part of the buffer handled in this round
            context.markBufferOffset();
            lexeme = context.getNextLexeme();
        }
        return lexeme;
    }

    /*
     * Runs every sub-segmenter on each character of the buffer, stopping
     * early when the context asks for a refill, then resets the sub-segmenters.
     */
    private void analyzeBuffer(){
        context.initCursor();
        do{
            for(ISegmenter segmenter : segmenters){
                segmenter.analyze(context);
            }
            if(context.needRefillBuffer()){
                break;
            }
        }while(context.moveCursor());
        this.resetSegmenters();
    }

    /* Resets every sub-segmenter. */
    private void resetSegmenters(){
        for(ISegmenter segmenter : segmenters){
            segmenter.reset();
        }
    }

    /**
     * Switches to a new input and clears all analysis state.
     *
     * @param input new source of the text
     */
    public synchronized void reset(Reader input) {
        this.input = input;
        context.reset();
        this.resetSegmenters();
    }
}

@ -0,0 +1,46 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
/**
 * Contract implemented by every sub-segmenter that the main segmenter drives
 * over an {@link AnalyzeContext}.
 */
interface ISegmenter {
/**
 * Examines the context at its current cursor position; implementations add
 * any lexemes they recognise to the context.
 * @param context the shared analysis context
 */
void analyze(AnalyzeContext context);
/**
 * Clears the sub-segmenter's internal state so it can start a new pass.
 */
void reset();
}

@ -0,0 +1,296 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
import java.util.Arrays;
/**
 * Sub-segmenter for English letters and Arabic digits.
 * <p>
 * At every cursor position it tracks three independent candidates: a run of
 * English letters, a run of digits (which may span number connectors), and a
 * mixed run of letters/digits that may contain connector characters. Each
 * finished run is added to the context as a {@link Lexeme}.
 */
class LetterSegmenter implements ISegmenter {
    // Label of this sub-segmenter, used when locking/unlocking the buffer
    static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
    // Connector characters allowed inside a mixed letter/digit token
    private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' , '@' , '_'};
    // Connector characters allowed inside a number
    private static final char[] Num_Connector = new char[]{',' , '.'};

    static {
        // binarySearch needs sorted input. Sort once at class load instead of
        // re-sorting the shared static arrays in every constructor call.
        Arrays.sort(Letter_Connector);
        Arrays.sort(Num_Connector);
    }

    // Start of the current mixed token; -1 while no token is in progress
    private int start;
    // Last position (inclusive) of the current mixed token; may point at a connector
    private int end;
    // Start of the current English run; -1 while idle
    private int englishStart;
    // Last position (inclusive) of the current English run
    private int englishEnd;
    // Start of the current digit run; -1 while idle
    private int arabicStart;
    // Last position (inclusive) of the current digit run
    private int arabicEnd;

    LetterSegmenter(){
        this.start = -1;
        this.end = -1;
        this.englishStart = -1;
        this.englishEnd = -1;
        this.arabicStart = -1;
        this.arabicEnd = -1;
    }

    /**
     * Runs the English, digit and mixed recognisers on the current character
     * and locks the buffer while any of them has a token in progress.
     *
     * @param context the shared analysis context
     */
    public void analyze(AnalyzeContext context) {
        // All three recognisers must run on every character, whatever the others return.
        boolean englishBusy = this.processEnglishLetter(context);
        boolean arabicBusy = this.processArabicLetter(context);
        // Mixed tokens go last; duplicates are filtered out by the QuickSortSet.
        boolean mixBusy = this.processMixLetter(context);
        if(englishBusy || arabicBusy || mixBusy){
            context.lockBuffer(SEGMENTER_NAME);
        }else{
            context.unlockBuffer(SEGMENTER_NAME);
        }
    }

    /**
     * Forgets every in-progress token.
     */
    public void reset() {
        this.start = -1;
        this.end = -1;
        this.englishStart = -1;
        this.englishEnd = -1;
        this.arabicStart = -1;
        this.arabicEnd = -1;
    }

    /**
     * Adds a lexeme covering buffer positions {@code first..last} (inclusive).
     */
    private void emit(AnalyzeContext context , int first , int last , int lexemeType){
        context.addLexeme(new Lexeme(context.getBufferOffset() , first , last - first + 1 , lexemeType));
    }

    /**
     * Tracks mixed letter/digit tokens such as "windos2000" or
     * "linliangyi2005@gmail.com".
     *
     * @param context the shared analysis context
     * @return true while a mixed token is still in progress (buffer must stay locked)
     */
    private boolean processMixLetter(AnalyzeContext context){
        boolean letterOrDigit = CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
                || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType();
        if(this.start == -1){
            // Idle: a letter or digit opens a new token.
            if(letterOrDigit){
                this.start = context.getCursor();
                this.end = this.start;
            }
        }else if(letterOrDigit){
            // In progress: remember the possible end position.
            this.end = context.getCursor();
        }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
                && this.isLetterConnector(context.getCurrentChar())){
            // Connectors extend the token as well.
            this.end = context.getCursor();
        }else{
            // Any other character terminates the token.
            this.emit(context , this.start , this.end , Lexeme.TYPE_LETTER);
            this.start = -1;
            this.end = -1;
        }
        // Flush a pending token when the buffer has been fully consumed.
        if(context.isBufferConsumed() && this.start != -1 && this.end != -1){
            this.emit(context , this.start , this.end , Lexeme.TYPE_LETTER);
            this.start = -1;
            this.end = -1;
        }
        return !(this.start == -1 && this.end == -1);
    }

    /**
     * Tracks runs made up of English letters only.
     *
     * @param context the shared analysis context
     * @return true while an English run is still in progress
     */
    private boolean processEnglishLetter(AnalyzeContext context){
        boolean english = CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType();
        if(this.englishStart == -1){
            // Idle: an English letter opens a new run.
            if(english){
                this.englishStart = context.getCursor();
                this.englishEnd = this.englishStart;
            }
        }else if(english){
            this.englishEnd = context.getCursor();
        }else{
            // A non-English character terminates the run.
            this.emit(context , this.englishStart , this.englishEnd , Lexeme.TYPE_ENGLISH);
            this.englishStart = -1;
            this.englishEnd = -1;
        }
        // Flush a pending run when the buffer has been fully consumed.
        if(context.isBufferConsumed() && this.englishStart != -1 && this.englishEnd != -1){
            this.emit(context , this.englishStart , this.englishEnd , Lexeme.TYPE_ENGLISH);
            this.englishStart = -1;
            this.englishEnd = -1;
        }
        return !(this.englishStart == -1 && this.englishEnd == -1);
    }

    /**
     * Tracks runs of Arabic digits; number connectors (',' and '.') neither
     * extend nor terminate the run.
     *
     * @param context the shared analysis context
     * @return true while a digit run is still in progress
     */
    private boolean processArabicLetter(AnalyzeContext context){
        boolean digit = CharacterUtil.CHAR_ARABIC == context.getCurrentCharType();
        if(this.arabicStart == -1){
            // Idle: a digit opens a new run.
            if(digit){
                this.arabicStart = context.getCursor();
                this.arabicEnd = this.arabicStart;
            }
        }else if(digit){
            this.arabicEnd = context.getCursor();
        }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
                && this.isNumConnector(context.getCurrentChar())){
            // Number connector: keep the run open without moving its end.
        }else{
            // A non-digit character terminates the run.
            this.emit(context , this.arabicStart , this.arabicEnd , Lexeme.TYPE_ARABIC);
            this.arabicStart = -1;
            this.arabicEnd = -1;
        }
        // Flush a pending run when the buffer has been fully consumed.
        if(context.isBufferConsumed() && this.arabicStart != -1 && this.arabicEnd != -1){
            this.emit(context , this.arabicStart , this.arabicEnd , Lexeme.TYPE_ARABIC);
            this.arabicStart = -1;
            this.arabicEnd = -1;
        }
        return !(this.arabicStart == -1 && this.arabicEnd == -1);
    }

    /**
     * @param input character to test
     * @return true if the character is one of the letter connectors
     */
    private boolean isLetterConnector(char input){
        return Arrays.binarySearch(Letter_Connector, input) >= 0;
    }

    /**
     * @param input character to test
     * @return true if the character is one of the number connectors
     */
    private boolean isNumConnector(char input){
        return Arrays.binarySearch(Num_Connector, input) >= 0;
    }
}

@ -0,0 +1,284 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
/**
* IK
*/
public class Lexeme implements Comparable<Lexeme>{
//lexemeType常量
//未知
public static final int TYPE_UNKNOWN = 0;
//英文
public static final int TYPE_ENGLISH = 1;
//数字
public static final int TYPE_ARABIC = 2;
//英文数字混合
public static final int TYPE_LETTER = 3;
//中文词元
public static final int TYPE_CNWORD = 4;
//中文单字
public static final int TYPE_CNCHAR = 64;
//日韩文字
public static final int TYPE_OTHER_CJK = 8;
//中文数词
public static final int TYPE_CNUM = 16;
//中文量词
public static final int TYPE_COUNT = 32;
//中文数量词
public static final int TYPE_CQUAN = 48;
//词元的起始位移
private int offset;
//词元的相对起始位置
private int begin;
//词元的长度
private int length;
//词元文本
private String lexemeText;
//词元类型
private int lexemeType;
public Lexeme(int offset , int begin , int length , int lexemeType){
this.offset = offset;
this.begin = begin;
if(length < 0){
throw new IllegalArgumentException("length < 0");
}
this.length = length;
this.lexemeType = lexemeType;
}
/*
*
*
* @see java.lang.Object#equals(Object o)
*/
public boolean equals(Object o){
if(o == null){
return false;
}
if(this == o){
return true;
}
if(o instanceof Lexeme){
Lexeme other = (Lexeme)o;
if(this.offset == other.getOffset()
&& this.begin == other.getBegin()
&& this.length == other.getLength()){
return true;
}else{
return false;
}
}else{
return false;
}
}
/*
*
* @see java.lang.Object#hashCode()
*/
public int hashCode(){
int absBegin = getBeginPosition();
int absEnd = getEndPosition();
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
}
/*
*
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(Lexeme other) {
//起始位置优先
if(this.begin < other.getBegin()){
return -1;
}else if(this.begin == other.getBegin()){
//词元长度优先
if(this.length > other.getLength()){
return -1;
}else if(this.length == other.getLength()){
return 0;
}else {//this.length < other.getLength()
return 1;
}
}else{//this.begin > other.getBegin()
return 1;
}
}
public int getOffset() {
return offset;
}
public void setOffset(int offset) {
this.offset = offset;
}
public int getBegin() {
return begin;
}
/**
*
* @return int
*/
public int getBeginPosition(){
return offset + begin;
}
public void setBegin(int begin) {
this.begin = begin;
}
/**
*
* @return int
*/
public int getEndPosition(){
return offset + begin + length;
}
/**
*
* @return int
*/
public int getLength(){
return this.length;
}
public void setLength(int length) {
if(this.length < 0){
throw new IllegalArgumentException("length < 0");
}
this.length = length;
}
/**
*
* @return String
*/
public String getLexemeText() {
if(lexemeText == null){
return "";
}
return lexemeText;
}
public void setLexemeText(String lexemeText) {
if(lexemeText == null){
this.lexemeText = "";
this.length = 0;
}else{
this.lexemeText = lexemeText;
this.length = lexemeText.length();
}
}
/**
*
* @return int
*/
public int getLexemeType() {
return lexemeType;
}
/**
*
* @return String
*/
public String getLexemeTypeString(){
switch(lexemeType) {
case TYPE_ENGLISH :
return "ENGLISH";
case TYPE_ARABIC :
return "ARABIC";
case TYPE_LETTER :
return "LETTER";
case TYPE_CNWORD :
return "CN_WORD";
case TYPE_CNCHAR :
return "CN_CHAR";
case TYPE_OTHER_CJK :
return "OTHER_CJK";
case TYPE_COUNT :
return "COUNT";
case TYPE_CNUM :
return "TYPE_CNUM";
case TYPE_CQUAN:
return "TYPE_CQUAN";
default :
return "UNKONW";
}
}
public void setLexemeType(int lexemeType) {
this.lexemeType = lexemeType;
}
/**
*
* @param l
* @param lexemeType
* @return boolean
*/
public boolean append(Lexeme l , int lexemeType){
if(l != null && this.getEndPosition() == l.getBeginPosition()){
this.length += l.getLength();
this.lexemeType = lexemeType;
return true;
}else {
return false;
}
}
/**
*
*/
public String toString(){
StringBuffer strbuf = new StringBuffer();
strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
strbuf.append(" : ").append(this.lexemeText).append(" : \t");
strbuf.append(this.getLexemeTypeString());
return strbuf.toString();
}
}

@ -0,0 +1,256 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
/**
 * A chain of lexemes (a candidate segmentation path) together with the span
 * it covers and the number of characters its lexemes account for.
 */
class LexemePath extends QuickSortSet implements Comparable<LexemePath>{
    // Start position of the path; -1 while empty
    private int pathBegin;
    // End position of the path (exclusive); -1 while empty
    private int pathEnd;
    // Number of characters covered by the lexemes in the path
    private int payloadLength;

    LexemePath(){
        this.pathBegin = -1;
        this.pathEnd = -1;
        this.payloadLength = 0;
    }

    /**
     * Stores the first lexeme of an empty path and initialises the span from it.
     */
    private void seedPath(Lexeme lexeme){
        this.addLexeme(lexeme);
        this.pathBegin = lexeme.getBegin();
        this.pathEnd = lexeme.getBegin() + lexeme.getLength();
        this.payloadLength += lexeme.getLength();
    }

    /**
     * Adds a lexeme that overlaps the current path (or any lexeme when the
     * path is empty).
     *
     * @param lexeme lexeme to add
     * @return true if added; false if it does not cross the path
     */
    boolean addCrossLexeme(Lexeme lexeme){
        if(this.isEmpty()){
            seedPath(lexeme);
            return true;
        }
        if(!this.checkCross(lexeme)){
            return false;
        }
        this.addLexeme(lexeme);
        int lexemeEnd = lexeme.getBegin() + lexeme.getLength();
        if(lexemeEnd > this.pathEnd){
            this.pathEnd = lexemeEnd;
        }
        this.payloadLength = this.pathEnd - this.pathBegin;
        return true;
    }

    /**
     * Adds a lexeme that does not overlap the current path (or any lexeme
     * when the path is empty).
     *
     * @param lexeme lexeme to add
     * @return true if added; false if it crosses the path
     */
    boolean addNotCrossLexeme(Lexeme lexeme){
        if(this.isEmpty()){
            seedPath(lexeme);
            return true;
        }
        if(this.checkCross(lexeme)){
            return false;
        }
        this.addLexeme(lexeme);
        this.payloadLength += lexeme.getLength();
        // Recompute the span from the first and last lexemes now in the set.
        Lexeme first = this.peekFirst();
        this.pathBegin = first.getBegin();
        Lexeme last = this.peekLast();
        this.pathEnd = last.getBegin() + last.getLength();
        return true;
    }

    /**
     * Removes the last lexeme of the path and updates the span.
     *
     * @return the removed lexeme, or null if the path was empty
     */
    Lexeme removeTail(){
        Lexeme removed = this.pollLast();
        if(!this.isEmpty()){
            this.payloadLength -= removed.getLength();
            Lexeme last = this.peekLast();
            this.pathEnd = last.getBegin() + last.getLength();
        }else{
            this.pathBegin = -1;
            this.pathEnd = -1;
            this.payloadLength = 0;
        }
        return removed;
    }

    /**
     * @param lexeme lexeme to test
     * @return true if the lexeme's span overlaps the path's span
     */
    boolean checkCross(Lexeme lexeme){
        boolean beginsInsidePath = lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd;
        if(beginsInsidePath){
            return true;
        }
        return this.pathBegin >= lexeme.getBegin()
                && this.pathBegin < lexeme.getBegin() + lexeme.getLength();
    }

    int getPathBegin() {
        return pathBegin;
    }

    int getPathEnd() {
        return pathEnd;
    }

    /**
     * @return number of characters covered by the lexemes in the path
     */
    int getPayloadLength(){
        return this.payloadLength;
    }

    /**
     * @return span of the path (pathEnd - pathBegin)
     */
    int getPathLength(){
        return this.pathEnd - this.pathBegin;
    }

    /**
     * @return X weight: the product of all lexeme lengths
     */
    int getXWeight(){
        int product = 1;
        for(Cell cell = this.getHead(); cell != null && cell.getLexeme() != null; cell = cell.getNext()){
            product *= cell.getLexeme().getLength();
        }
        return product;
    }

    /**
     * @return position weight: sum of (1-based index * lexeme length)
     */
    int getPWeight(){
        int weight = 0;
        int position = 0;
        for(Cell cell = this.getHead(); cell != null && cell.getLexeme() != null; cell = cell.getNext()){
            position++;
            weight += position * cell.getLexeme().getLength();
        }
        return weight;
    }

    /**
     * @return a new path with the same span values and the same lexeme references
     */
    LexemePath copy(){
        LexemePath duplicate = new LexemePath();
        duplicate.pathBegin = this.pathBegin;
        duplicate.pathEnd = this.pathEnd;
        duplicate.payloadLength = this.payloadLength;
        for(Cell cell = this.getHead(); cell != null && cell.getLexeme() != null; cell = cell.getNext()){
            duplicate.addLexeme(cell.getLexeme());
        }
        return duplicate;
    }

    /**
     * Ranks paths; "better" paths sort first (return -1). Criteria, in order:
     * larger payload, fewer lexemes, larger span, later end position, larger
     * X weight, larger position weight.
     */
    public int compareTo(LexemePath o) {
        // Larger covered text length wins
        if(this.payloadLength != o.payloadLength){
            return this.payloadLength > o.payloadLength ? -1 : 1;
        }
        // Fewer lexemes wins
        if(this.size() != o.size()){
            return this.size() < o.size() ? -1 : 1;
        }
        // Larger path span wins
        if(this.getPathLength() != o.getPathLength()){
            return this.getPathLength() > o.getPathLength() ? -1 : 1;
        }
        // Backward segmentation is statistically more likely, so a later end wins
        if(this.pathEnd != o.pathEnd){
            return this.pathEnd > o.pathEnd ? -1 : 1;
        }
        // More evenly sized lexemes win
        int mineX = this.getXWeight();
        int theirsX = o.getXWeight();
        if(mineX != theirsX){
            return mineX > theirsX ? -1 : 1;
        }
        // Finally compare the position weights
        int mineP = this.getPWeight();
        int theirsP = o.getPWeight();
        if(mineP != theirsP){
            return mineP > theirsP ? -1 : 1;
        }
        return 0;
    }

    /**
     * @return a multi-line dump of the span values and every lexeme
     */
    public String toString(){
        StringBuilder out = new StringBuilder();
        out.append("pathBegin  : ").append(pathBegin).append("\r\n");
        out.append("pathEnd  : ").append(pathEnd).append("\r\n");
        out.append("payloadLength  : ").append(payloadLength).append("\r\n");
        for(Cell cell = this.getHead(); cell != null; cell = cell.getNext()){
            out.append("lexeme : ").append(cell.getLexeme()).append("\r\n");
        }
        return out.toString();
    }
}

@ -0,0 +1,239 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.core;
/**
 * Sorted, duplicate-free set of lexemes backed by a doubly linked list.
 * Ordering and duplicate detection both use {@link Lexeme#compareTo}.
 */
class QuickSortSet {
    // Head of the list
    private Cell head;
    // Tail of the list
    private Cell tail;
    // Number of cells currently in the list
    private int size;

    QuickSortSet(){
        this.size = 0;
    }

    /**
     * Inserts a lexeme at its sorted position.
     *
     * @param lexeme lexeme to add, must not be null
     * @return true if inserted; false if an equally ordered lexeme is already present
     * @throws IllegalArgumentException if {@code lexeme} is null
     */
    boolean addLexeme(Lexeme lexeme){
        Cell newCell = new Cell(lexeme);
        if(this.size == 0){
            this.head = newCell;
            this.tail = newCell;
            this.size++;
            return true;
        }
        int tailOrder = this.tail.compareTo(newCell);
        if(tailOrder == 0){
            // Same as the tail lexeme: not added
            return false;
        }
        if(tailOrder < 0){
            // Sorts after the tail: append
            this.tail.next = newCell;
            newCell.prev = this.tail;
            this.tail = newCell;
            this.size++;
            return true;
        }
        if(this.head.compareTo(newCell) > 0){
            // Sorts before the head: prepend
            this.head.prev = newCell;
            newCell.next = this.head;
            this.head = newCell;
            this.size++;
            return true;
        }
        // Walk backwards from the tail to the last cell that does not sort after the new one
        Cell index = this.tail;
        while(index != null && index.compareTo(newCell) > 0){
            index = index.prev;
        }
        if(index.compareTo(newCell) == 0){
            // Duplicate of an existing lexeme: not added
            return false;
        }
        // index sorts before the new cell and is not the tail, so index.next exists
        newCell.prev = index;
        newCell.next = index.next;
        index.next.prev = newCell;
        index.next = newCell;
        this.size++;
        return true;
    }

    /**
     * @return the first lexeme without removing it, or null if the set is empty
     */
    Lexeme peekFirst(){
        if(this.head != null){
            return this.head.lexeme;
        }
        return null;
    }

    /**
     * Removes and returns the first lexeme.
     *
     * @return the removed lexeme, or null if the set is empty
     */
    Lexeme pollFirst(){
        if(this.size == 0){
            return null;
        }
        Lexeme first = this.head.lexeme;
        if(this.size == 1){
            this.head = null;
            this.tail = null;
        }else{
            this.head = this.head.next;
            // Detach the removed cell so backward traversal stops at the new head
            this.head.prev = null;
        }
        this.size--;
        return first;
    }

    /**
     * @return the last lexeme without removing it, or null if the set is empty
     */
    Lexeme peekLast(){
        if(this.tail != null){
            return this.tail.lexeme;
        }
        return null;
    }

    /**
     * Removes and returns the last lexeme.
     *
     * @return the removed lexeme, or null if the set is empty
     */
    Lexeme pollLast(){
        if(this.size == 0){
            return null;
        }
        Lexeme last = this.tail.lexeme;
        if(this.size == 1){
            this.head = null;
            this.tail = null;
        }else{
            this.tail = this.tail.prev;
            // Detach the removed cell; otherwise getHead()/getNext() traversal
            // would still reach it after removal
            this.tail.next = null;
        }
        this.size--;
        return last;
    }

    /**
     * @return number of lexemes in the set
     */
    int size(){
        return this.size;
    }

    /**
     * @return true if the set holds no lexemes
     */
    boolean isEmpty(){
        return this.size == 0;
    }

    /**
     * @return the head cell of the list, or null if the set is empty
     */
    Cell getHead(){
        return this.head;
    }

    /**
     * List cell of the QuickSortSet: wraps one lexeme and links to its
     * neighbours. Cells compare by their lexemes.
     */
    class Cell implements Comparable<Cell>{
        private Cell prev;
        private Cell next;
        private Lexeme lexeme;

        Cell(Lexeme lexeme){
            if(lexeme == null){
                throw new IllegalArgumentException("lexeme must not be null");
            }
            this.lexeme = lexeme;
        }

        public int compareTo(Cell o) {
            return this.lexeme.compareTo(o.lexeme);
        }

        public Cell getPrev(){
            return this.prev;
        }

        public Cell getNext(){
            return this.next;
        }

        public Lexeme getLexeme(){
            return this.lexeme;
        }
    }
}

@ -0,0 +1,330 @@
/**
*
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.dic;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
 * Node of the dictionary trie. Each node holds one character and its child
 * nodes; children are kept in a small sorted array until there are more than
 * ARRAY_LENGTH_LIMIT of them, after which they are moved to a map.
 */
class DictSegment implements Comparable<DictSegment>{
    // Shared table of Character objects seen so far, so equal characters reuse one instance
    private static final Map<Character , Character> charMap = new HashMap<Character , Character>(16 , 0.95f);
    // Maximum number of children held in array form
    private static final int ARRAY_LENGTH_LIMIT = 3;
    // Children in map form (used once storeSize exceeds ARRAY_LENGTH_LIMIT)
    private Map<Character , DictSegment> childrenMap;
    // Children in array form; the first storeSize entries are kept sorted
    private DictSegment[] childrenArray;
    // Character stored on this node
    private Character nodeChar;
    // Number of child segments stored on this node
    private int storeSize = 0;
    // Node state: 0 by default, 1 when the path from the root to this node is a word
    private int nodeState = 0;

    DictSegment(Character nodeChar){
        if(nodeChar == null){
            throw new IllegalArgumentException("参数为空异常,字符不能为空");
        }
        this.nodeChar = nodeChar;
    }

    Character getNodeChar() {
        return nodeChar;
    }

    /*
     * True when this node has at least one child.
     */
    boolean hasNextNode(){
        return  this.storeSize > 0;
    }

    /**
     * Matches the whole array against the trie below this node.
     * @param charArray characters to match
     * @return Hit describing the match state
     */
    Hit match(char[] charArray){
        return this.match(charArray , 0 , charArray.length , null);
    }

    /**
     * Matches a slice of the array against the trie below this node.
     * @param charArray characters to match
     * @param begin index of the first character
     * @param length number of characters to match
     * @return Hit describing the match state
     */
    Hit match(char[] charArray , int begin , int length){
        return this.match(charArray , begin , length , null);
    }

    /**
     * Matches a slice of the array against the trie below this node.
     * @param charArray characters to match
     * @param begin index of the first character
     * @param length number of characters to match; a non-positive length yields an unmatched Hit
     * @param searchHit Hit to reuse, or null to create a new one
     * @return Hit describing the match state
     */
    Hit match(char[] charArray , int begin , int length , Hit searchHit){
        if(searchHit == null){
            // No hit supplied: create one and record where the match starts
            searchHit= new Hit();
            searchHit.setBegin(begin);
        }else{
            // Reused hit: clear its previous state
            searchHit.setUnmatch();
        }
        // Record the position currently being processed
        searchHit.setEnd(begin);

        if(length <= 0){
            // Nothing to match (e.g. an empty array): report "unmatched"
            // instead of indexing past the end of charArray
            return searchHit;
        }

        Character keyChar = Character.valueOf(charArray[begin]);
        DictSegment ds = null;

        // Copy the instance fields to locals so a concurrent update does not swap them mid-lookup
        DictSegment[] segmentArray = this.childrenArray;
        Map<Character , DictSegment> segmentMap = this.childrenMap;

        // STEP1 find the child segment for keyChar
        if(segmentArray != null){
            DictSegment keySegment = new DictSegment(keyChar);
            int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment);
            if(position >= 0){
                ds = segmentArray[position];
            }
        }else if(segmentMap != null){
            ds = (DictSegment)segmentMap.get(keyChar);
        }

        // STEP2 child found: recurse, or evaluate the state on the last character
        if(ds != null){
            if(length > 1){
                // More characters to match: continue below the child
                return ds.match(charArray, begin + 1 , length - 1 , searchHit);
            }else if (length == 1){
                if(ds.nodeState == 1){
                    // The path ends on a complete word
                    searchHit.setMatch();
                }
                if(ds.hasNextNode()){
                    // Longer words share this prefix; remember the node for continuation
                    searchHit.setPrefix();
                    searchHit.setMatchedDictSegment(ds);
                }
                return searchHit;
            }
        }
        // STEP3 no child found: the hit stays unmatched
        return searchHit;
    }

    /**
     * Adds a word to the trie below this node. An empty array is ignored.
     * @param charArray characters of the word
     */
    void fillSegment(char[] charArray){
        this.fillSegment(charArray, 0 , charArray.length , 1);
    }

    /**
     * Disables (masks) a word in the trie below this node. An empty array is ignored.
     * @param charArray characters of the word
     */
    void disableSegment(char[] charArray){
        this.fillSegment(charArray, 0 , charArray.length , 0);
    }

    /**
     * Walks or creates the path for a word and sets the state of its last node.
     * @param charArray characters of the word
     * @param begin index of the character handled by this node
     * @param length number of characters left, including the one at begin
     * @param enabled 1 to mark the path as a word, 0 to mask it
     */
    private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){
        if(length <= 0){
            // Empty word: nothing to store (and charArray[begin] would be out of bounds)
            return;
        }
        // Reuse the shared Character object for this char, registering it on first use
        Character beginChar = Character.valueOf(charArray[begin]);
        Character keyChar = charMap.get(beginChar);
        if(keyChar == null){
            charMap.put(beginChar, beginChar);
            keyChar = beginChar;
        }

        // Find the child for keyChar; it is only created when enabled == 1
        DictSegment ds = lookforSegment(keyChar , enabled);
        if(ds != null){
            if(length > 1){
                // The word is not fully stored yet: continue below the child
                ds.fillSegment(charArray, begin + 1, length - 1 , enabled);
            }else if (length == 1){
                // Last character: enabled=1 marks a complete word, enabled=0 masks it
                ds.nodeState = enabled;
            }
        }
    }

    /**
     * Finds the child segment for keyChar, optionally creating it.
     * @param keyChar character to look up
     * @param create 1 to create the child when absent; 0 to return null instead
     * @return the child segment, or null
     */
    private DictSegment lookforSegment(Character keyChar ,  int create){
        DictSegment ds = null;

        if(this.storeSize <= ARRAY_LENGTH_LIMIT){
            // Array storage (the array is created on demand)
            DictSegment[] segmentArray = getChildrenArray();
            DictSegment keySegment = new DictSegment(keyChar);
            int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment);
            if(position >= 0){
                ds = segmentArray[position];
            }

            // Not found in the array
            if(ds == null && create == 1){
                ds = keySegment;
                if(this.storeSize < ARRAY_LENGTH_LIMIT){
                    // Room left in the array: store there and keep it sorted
                    segmentArray[this.storeSize] = ds;
                    this.storeSize++;
                    Arrays.sort(segmentArray , 0 , this.storeSize);
                }else{
                    // Array is full: switch to map storage
                    Map<Character , DictSegment> segmentMap = getChildrenMap();
                    // Move the existing children into the map
                    migrate(segmentArray ,  segmentMap);
                    segmentMap.put(keyChar, ds);
                    // Increment before releasing the array so an empty array is never observed
                    this.storeSize++;
                    this.childrenArray = null;
                }
            }
        }else{
            // Map storage (the map is created on demand)
            Map<Character , DictSegment> segmentMap = getChildrenMap();
            ds = (DictSegment)segmentMap.get(keyChar);
            if(ds == null && create == 1){
                ds = new DictSegment(keyChar);
                segmentMap.put(keyChar , ds);
                this.storeSize ++;
            }
        }
        return ds;
    }

    /**
     * Returns the children array, creating it under this node's lock when absent.
     */
    private DictSegment[] getChildrenArray(){
        if(this.childrenArray == null){
            synchronized(this){
                if(this.childrenArray == null){
                    this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
                }
            }
        }
        return this.childrenArray;
    }

    /**
     * Returns the children map, creating it under this node's lock when absent.
     */
    private Map<Character , DictSegment> getChildrenMap(){
        if(this.childrenMap == null){
            synchronized(this){
                if(this.childrenMap == null){
                    this.childrenMap = new HashMap<Character , DictSegment>(ARRAY_LENGTH_LIMIT * 2,0.8f);
                }
            }
        }
        return this.childrenMap;
    }

    /**
     * Copies every non-null segment of the array into the map, keyed by its character.
     * @param segmentArray source array
     * @param segmentMap destination map
     */
    private void migrate(DictSegment[] segmentArray , Map<Character , DictSegment> segmentMap){
        for(DictSegment segment : segmentArray){
            if(segment != null){
                segmentMap.put(segment.nodeChar, segment);
            }
        }
    }

    /**
     * Orders segments by the character they store.
     * @param o segment to compare with
     * @return result of comparing the two node characters
     */
    public int compareTo(DictSegment o) {
        return this.nodeChar.compareTo(o.nodeChar);
    }
}

@ -0,0 +1,362 @@
/**
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package com.luoo.music.ik.dic;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.List;
import com.luoo.music.ik.cfg.Configuration;
/**
* ,
*/
public class Dictionary {
/*
 * Singleton dictionary instance. Declared volatile so the double-checked
 * locking in initial() never exposes a partially constructed Dictionary.
 */
private static volatile Dictionary singleton;
/*
 * Main dictionary
 */
private DictSegment _MainDict;
/*
 * Stop-word dictionary
 */
private DictSegment _StopWordDict;
/*
 * Quantifier dictionary
 */
private DictSegment _QuantifierDict;
/**
 * Configuration the dictionaries are loaded from
 */
private Configuration cfg;

/**
 * Loads the main, stop-word and quantifier dictionaries.
 * @param cfg configuration the dictionaries are loaded from
 */
private Dictionary(Configuration cfg){
    this.cfg = cfg;
    this.loadMainDict();
    this.loadStopWordDict();
    this.loadQuantifierDict();
}

/**
 * Initialises the dictionary singleton on the first call; later calls return
 * the existing instance and ignore their argument.
 *
 * @param cfg configuration used when the singleton is created
 * @return the Dictionary singleton
 */
public static Dictionary initial(Configuration cfg){
    Dictionary instance = singleton;
    if(instance == null){
        synchronized(Dictionary.class){
            instance = singleton;
            if(instance == null){
                instance = new Dictionary(cfg);
                singleton = instance;
            }
        }
    }
    return instance;
}
/**
 * Returns the dictionary singleton.
 * @return the Dictionary singleton
 * @throws IllegalStateException if initial() has not created the singleton yet
 */
public static Dictionary getSingleton(){
    final Dictionary current = singleton;
    if(current != null){
        return current;
    }
    throw new IllegalStateException("词典尚未初始化请先调用initial方法");
}
/**
 * Adds a batch of words to the main dictionary. Each word is trimmed and
 * lower-cased first; null and blank entries are skipped.
 * @param words words to add; null is treated as an empty collection
 */
public void addWords(Collection<String> words){
    if(words == null){
        return;
    }
    for(String word : words){
        if(word == null){
            continue;
        }
        String normalized = word.trim().toLowerCase();
        // A blank entry trims to an empty string, which is not a valid
        // dictionary word; skip it instead of passing an empty array on
        if(normalized.length() == 0){
            continue;
        }
        singleton._MainDict.fillSegment(normalized.toCharArray());
    }
}
/**
 * Disables (masks) a batch of words in the main dictionary. Each word is
 * trimmed and lower-cased first; null and blank entries are skipped.
 * @param words words to disable; null is treated as an empty collection
 */
public void disableWords(Collection<String> words){
    if(words == null){
        return;
    }
    for(String word : words){
        if(word == null){
            continue;
        }
        String normalized = word.trim().toLowerCase();
        // A blank entry trims to an empty string, which is not a valid
        // dictionary word; skip it instead of passing an empty array on
        if(normalized.length() == 0){
            continue;
        }
        singleton._MainDict.disableSegment(normalized.toCharArray());
    }
}
/**
 * Matches the whole character array against the main dictionary.
 * @param charArray characters to match
 * @return Hit describing the match result
 */
public Hit matchInMainDict(char[] charArray){
    final DictSegment mainDict = singleton._MainDict;
    return mainDict.match(charArray);
}
/**
 * Matches a slice of the character array against the main dictionary.
 * @param charArray characters to match
 * @param begin index of the first character of the slice
 * @param length number of characters in the slice
 * @return Hit describing the match result
 */
public Hit matchInMainDict(char[] charArray , int begin, int length){
    final DictSegment mainDict = singleton._MainDict;
    return mainDict.match(charArray, begin, length);
}
/**
 * Matches a slice of the character array against the quantifier dictionary.
 * @param charArray characters to match
 * @param begin index of the first character of the slice
 * @param length number of characters in the slice
 * @return Hit describing the match result
 */
public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
    final DictSegment quantifierDict = singleton._QuantifierDict;
    return quantifierDict.match(charArray, begin, length);
}
/**
 * Continues a previous match: takes the DictSegment stored in the given Hit
 * and matches the single character at currentIndex below it, reusing the Hit.
 * @param charArray characters being matched
 * @param currentIndex index of the character to match next
 * @param matchedHit Hit from the previous step, carrying the matched DictSegment
 * @return the Hit updated with the new match state
 */
public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
    final DictSegment resumeFrom = matchedHit.getMatchedDictSegment();
    final int singleChar = 1;
    return resumeFrom.match(charArray, currentIndex, singleChar , matchedHit);
}
/**
 * Tells whether a slice of the character array is a stop word.
 * @param charArray characters to test
 * @param begin index of the first character of the slice
 * @param length number of characters in the slice
 * @return true if the slice fully matches an entry of the stop-word dictionary
 */
public boolean isStopWord(char[] charArray , int begin, int length){
    final Hit stopWordHit = singleton._StopWordDict.match(charArray, begin, length);
    return stopWordHit.isMatch();
}
/**
 * Loads the main dictionary from the classpath resource named by the
 * configuration, then loads the extension dictionaries into it.
 * Blank lines are skipped; every entry is trimmed and lower-cased.
 *
 * @throws RuntimeException if the main dictionary resource cannot be found
 */
private void loadMainDict(){
    // Root node of the main dictionary
    _MainDict = new DictSegment((char)0);
    InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary());
    if(is == null){
        throw new RuntimeException("Main Dictionary not found!!!");
    }

    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
        for(String line = br.readLine(); line != null; line = br.readLine()){
            String entry = line.trim();
            if(!"".equals(entry)){
                _MainDict.fillSegment(entry.toLowerCase().toCharArray());
            }
        }
    } catch (IOException ioe) {
        System.err.println("Main Dictionary loading exception.");
        ioe.printStackTrace();
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Extension dictionaries are merged into the main dictionary
    this.loadExtDict();
}
/**
 * Loads the user-configured extension dictionaries into the main dictionary.
 * <p>
 * The file names come from {@code cfg.getExtDictionarys()}; a {@code null}
 * list means there is nothing to load. Each name is resolved as a classpath
 * resource and read as UTF-8; a name that cannot be resolved is skipped.
 * Each line is trimmed and lower-cased; blank lines are skipped. An I/O
 * failure on one file is reported on stderr and loading continues with the
 * next file.
 */
private void loadExtDict(){
    // Extension dictionary configuration
    List<String> extDictFiles = cfg.getExtDictionarys();
    if(extDictFiles == null){
        return;
    }
    for(String extDictName : extDictFiles){
        // Open the extension dictionary file
        System.out.println("加载扩展词典:" + extDictName);
        InputStream is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
        // A missing extension dictionary is ignored
        if(is == null){
            continue;
        }
        // try-with-resources closes both the reader and the underlying stream,
        // including when reading fails part-way through.
        try (InputStream in = is;
             BufferedReader br = new BufferedReader(new InputStreamReader(in , "UTF-8"), 512)) {
            String theWord;
            while ((theWord = br.readLine()) != null) {
                String trimmed = theWord.trim();
                if (!"".equals(trimmed)) {
                    // Extension words go into the main in-memory dictionary
                    _MainDict.fillSegment(trimmed.toLowerCase().toCharArray());
                }
            }
        } catch (IOException ioe) {
            System.err.println("Extension Dictionary loading exception.");
            ioe.printStackTrace();
        }
    }
}
/**
 * Builds the stop word dictionary from the user-configured stop word files.
 * <p>
 * The file names come from {@code cfg.getExtStopWordDictionarys()}; a
 * {@code null} list leaves the dictionary empty. Each name is resolved as a
 * classpath resource and read as UTF-8; a name that cannot be resolved is
 * skipped. Each line is trimmed and lower-cased; blank lines are skipped.
 * An I/O failure on one file is reported on stderr and loading continues
 * with the next file.
 */
private void loadStopWordDict(){
    // Create the root node of the stop word dictionary
    _StopWordDict = new DictSegment((char)0);
    // Extension stop word dictionary configuration
    List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();
    if(extStopWordDictFiles == null){
        return;
    }
    for(String extStopWordDictName : extStopWordDictFiles){
        System.out.println("加载扩展停止词典:" + extStopWordDictName);
        // Open the stop word dictionary file
        InputStream is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
        // A missing stop word dictionary is ignored
        if(is == null){
            continue;
        }
        // try-with-resources closes both the reader and the underlying stream,
        // including when reading fails part-way through.
        try (InputStream in = is;
             BufferedReader br = new BufferedReader(new InputStreamReader(in , "UTF-8"), 512)) {
            String theWord;
            while ((theWord = br.readLine()) != null) {
                String trimmed = theWord.trim();
                if (!"".equals(trimmed)) {
                    // Store the stop word in memory
                    _StopWordDict.fillSegment(trimmed.toLowerCase().toCharArray());
                }
            }
        } catch (IOException ioe) {
            System.err.println("Extension Stop word Dictionary loading exception.");
            ioe.printStackTrace();
        }
    }
}
/**
 * Builds the quantifier dictionary.
 * <p>
 * The dictionary file is the classpath resource named by
 * {@code cfg.getQuantifierDicionary()}, read as UTF-8. Each line is trimmed
 * and lower-cased; blank lines are skipped. An I/O failure while reading is
 * reported on stderr and not propagated, so the dictionary keeps whatever
 * was loaded before the failure.
 *
 * @throws RuntimeException if the quantifier dictionary resource cannot be found
 */
private void loadQuantifierDict(){
    // Create the root node of the quantifier dictionary
    _QuantifierDict = new DictSegment((char)0);
    // Open the quantifier dictionary file
    InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
    if(is == null){
        throw new RuntimeException("Quantifier Dictionary not found!!!");
    }
    // try-with-resources closes both the reader and the underlying stream,
    // including when reading fails part-way through.
    try (InputStream in = is;
         BufferedReader br = new BufferedReader(new InputStreamReader(in , "UTF-8"), 512)) {
        String theWord;
        while ((theWord = br.readLine()) != null) {
            String trimmed = theWord.trim();
            if (!"".equals(trimmed)) {
                _QuantifierDict.fillSegment(trimmed.toLowerCase().toCharArray());
            }
        }
    } catch (IOException ioe) {
        System.err.println("Quantifier Dictionary loading exception.");
        ioe.printStackTrace();
    }
}
}

@ -0,0 +1,117 @@
/**
*
* IK 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.dic;
/**
 * Result of one dictionary lookup.
 * <p>
 * The state is a small set of bit flags: a hit can be flagged as a full
 * match, as a prefix match, as both, or as neither (unmatched).
 */
public class Hit {
    // State value with no flag set: no match
    private static final int UNMATCH = 0x00000000;
    // Flag bit: full match
    private static final int MATCH = 0x00000001;
    // Flag bit: prefix match
    private static final int PREFIX = 0x00000010;

    // Current state of this hit; starts out unmatched
    private int hitState = UNMATCH;

    // Dictionary branch node reached so far during the matching process
    private DictSegment matchedDictSegment;

    // Begin position (presumably of the matched span in the input - confirm with callers)
    private int begin;

    // End position (presumably of the matched span in the input - confirm with callers)
    private int end;

    /**
     * @return {@code true} when the full-match flag is set
     */
    public boolean isMatch() {
        return (hitState & MATCH) != 0;
    }

    /**
     * Sets the full-match flag, leaving the prefix flag as it is.
     */
    public void setMatch() {
        hitState |= MATCH;
    }

    /**
     * @return {@code true} when the prefix-match flag is set
     */
    public boolean isPrefix() {
        return (hitState & PREFIX) != 0;
    }

    /**
     * Sets the prefix-match flag, leaving the full-match flag as it is.
     */
    public void setPrefix() {
        hitState |= PREFIX;
    }

    /**
     * @return {@code true} when no flag is set
     */
    public boolean isUnmatch() {
        return hitState == UNMATCH;
    }

    /**
     * Clears every flag, returning this hit to the unmatched state.
     */
    public void setUnmatch() {
        hitState = UNMATCH;
    }

    public DictSegment getMatchedDictSegment() {
        return this.matchedDictSegment;
    }

    public void setMatchedDictSegment(DictSegment matchedDictSegment) {
        this.matchedDictSegment = matchedDictSegment;
    }

    public int getBegin() {
        return this.begin;
    }

    public void setBegin(int begin) {
        this.begin = begin;
    }

    public int getEnd() {
        return this.end;
    }

    public void setEnd(int end) {
        this.end = end;
    }
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,316 @@
世纪
位数
像素
克拉
公亩
公克
公分
公升
公尺
公担
公斤
公里
公顷
分钟
分米
加仑
千克
千米
厘米
周年
小时
平方
平方公尺
平方公里
平方分米
平方厘米
平方码
平方米
平方英寸
平方英尺
平方英里
平米
年代
年级
月份
毫升
毫米
毫克
海里
点钟
盎司
秒钟
立方公尺
立方分米
立方厘米
立方码
立方米
立方英寸
立方英尺
英亩
英寸
英尺
英里
阶段

@ -0,0 +1,76 @@
/**
* IK 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.luoo.music.ik.lucene;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Lucene {@link Analyzer} backed by {@link IKTokenizer}.
 * Written against the Lucene 4.0 Analyzer API.
 */
public final class IKAnalyzer extends Analyzer{

    // Segmentation mode flag handed to every tokenizer this analyzer creates
    private boolean useSmart;

    /**
     * Creates an analyzer with the smart flag switched off.
     */
    public IKAnalyzer(){
        this(false);
    }

    /**
     * Creates an analyzer with an explicit smart flag.
     *
     * @param useSmart true to create tokenizers in smart segmentation mode
     */
    public IKAnalyzer(boolean useSmart){
        super();
        this.useSmart = useSmart;
    }

    /**
     * @return the smart flag currently configured
     */
    public boolean useSmart() {
        return this.useSmart;
    }

    /**
     * @param useSmart new value of the smart flag; it is read each time
     *                 {@code createComponents} builds a tokenizer
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Builds the token stream components for a field: a single
     * {@link IKTokenizer} over the reader, with no further filters.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
        return new TokenStreamComponents(new IKTokenizer(in , useSmart));
    }
}

@ -0,0 +1,114 @@
/**
* IK 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* (linliangyi2005@gmail.com)
* 2012
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package com.luoo.music.ik.lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.luoo.music.ik.core.IKSegmenter;
import com.luoo.music.ik.core.Lexeme;
/**
 * IK Lucene Tokenizer adapter.
 * Written against the Lucene 4.0 Tokenizer API.
 * <p>
 * Each call to {@link #incrementToken()} pulls the next {@link Lexeme} from
 * an {@link IKSegmenter} and copies its text, offsets and type into the
 * Lucene token attributes.
 */
public final class IKTokenizer extends Tokenizer {
    // IK segmenter that produces the lexemes
    private final IKSegmenter _IKImplement;
    // Token text attribute
    private final CharTermAttribute termAtt;
    // Token offset attribute
    private final OffsetAttribute offsetAtt;
    // Token type attribute; the value is whatever Lexeme.getLexemeTypeString() returns
    private final TypeAttribute typeAtt;
    // End position of the last lexeme emitted since the last reset
    private int endPosition;

    /**
     * Lucene 4.0 Tokenizer adapter constructor.
     *
     * @param in reader supplying the text to tokenize
     * @param useSmart segmentation mode flag passed to the IK segmenter
     */
    public IKTokenizer(Reader in , boolean useSmart){
        super(in);
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        _IKImplement = new IKSegmenter(input , useSmart);
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // Clear all token attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if(nextLexeme == null){
            // No more lexemes: tell the consumer the stream is exhausted
            return false;
        }
        // Copy the lexeme into the attributes: text and length first
        termAtt.append(nextLexeme.getLexemeText());
        termAtt.setLength(nextLexeme.getLength());
        // Offsets go through correctOffset so they stay valid when the input
        // was wrapped by a CharFilter; without one it returns them unchanged.
        offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()),
                correctOffset(nextLexeme.getEndPosition()));
        // Remember the raw end position; end() corrects it when reporting
        endPosition = nextLexeme.getEndPosition();
        // Token type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        // true tells the consumer a token is available
        return true;
    }

    /*
     * (non-Javadoc)
     * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
        // Forget the previous input's last position so that end() does not
        // report a stale offset when the new input yields no tokens.
        endPosition = 0;
    }

    @Override
    public final void end() {
        // TokenStream.end() documents that overrides must call super.end().
        // The signature stays free of checked exceptions for existing callers,
        // so an IOException from the superclass is rethrown unchecked.
        try {
            super.end();
        } catch (IOException e) {
            throw new java.io.UncheckedIOException(e);
        }
        // set final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}

@ -1,5 +1,7 @@
package com.luoo.music.service;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDateTime;
@ -15,6 +17,11 @@ import java.util.UUID;
import javax.annotation.PostConstruct;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.connection.RedisZSetCommands.Limit;
import org.springframework.data.redis.connection.RedisZSetCommands.Range;
@ -24,6 +31,8 @@ import org.springframework.util.StringUtils;
import com.luoo.music.dao.JournalDao;
import com.luoo.music.dao.JournalSongDao;
import com.luoo.music.dao.SongInfoDao;
import com.luoo.music.ik.lucene.IKAnalyzer;
import enums.DateTimePatternEnum;
import lombok.SneakyThrows;
@ -40,8 +49,42 @@ public class SearchService {
@Autowired
private JournalSongDao journalSongDao;
@Autowired
private SongInfoDao songInfoDao;
@PostConstruct
private void init() {
//todo: 添加定时任务 填充搜索补全 zset
/*
* journalSongDao.findAll().parallelStream().forEach(s->{ String
* name=s.getName(); String artist=s.getArtist(); String album=s.getAlbum();
* if(isNeedReplace(name)||isNeedReplace(artist)||isNeedReplace(album)) {
* s.setName(name.replaceAll("&#39;", "'").replaceAll("&amp;", "&"));
* s.setArtist(artist.replaceAll("&#39;", "'").replaceAll("&amp;", "&"));
* s.setAlbum(album.replaceAll("&#39;", "'").replaceAll("&amp;", "&"));
* journalSongDao.save(s); }
*
* });
*/
/*
* journalDao.findValidJournals().parallelStream().forEach(j->{
* addIKKeyWord(j.getJournalNo()); addIKKeyWord(j.getTitle()); });
*
*
*
* journalSongDao.findAll().parallelStream().forEach(s->{
*
* addIKKeyWord(s.getName()); addIKKeyWord(s.getArtist());
* addIKKeyWord(s.getAlbum()); });
*
*
* System.exit(-1);
*/
/*
* DateTimeFormatter formatter =
* DateTimeFormatter.ofPattern(DateTimePatternEnum.YYYY_MM_DD_HH_MM_SS.
@ -76,27 +119,73 @@ public class SearchService {
}
private boolean isNeedReplace(String name) {
    // True when the text contains either of the two HTML entities checked below.
    if (name.contains("&#39;")) {
        return true;
    }
    return name.contains("&amp;");
}
/**
 * Returns completion candidates stored for a query string.
 * <p>
 * Reads the sorted set whose key is {@code REDIS_AUTO_COMPLETE + query},
 * taking members with a score between 0 and {@code Double.MAX_VALUE},
 * starting at offset 0 and returning at most {@code limit} of them.
 *
 * @param query text typed so far; used verbatim as the key suffix
 * @param limit maximum number of candidates to return
 * @return a new list holding the members returned by Redis
 */
public List<String> autoComplete(String query, int limit) {
    String key = REDIS_AUTO_COMPLETE + query;
    Set<String> candidates = redisTemplate.opsForZSet().rangeByScore(key, 0, Double.MAX_VALUE, 0, limit);
    return new ArrayList<>(candidates);
}
// NOTE(review): this span looks like a rendered diff in which the removed
// addKeyWord signature and prefix loop appear next to the added addIKKeyWord
// implementation, without +/- markers. Confirm against the real source file
// before changing any code here.
// As written, the added code splits the trimmed keyword into tokens with
// IKAnalyzer and, for every prefix of every token (plus its lower-cased form
// when that differs), adds the keyword with score 0 to the sorted set keyed
// REDIS_AUTO_COMPLETE + prefix.
@SneakyThrows
public void addKeyWord(String keyword) {
private void addIKKeyWord(String keyword) {
if(!StringTools.isEmpty(keyword)) {
for(int i=1;i<keyword.length()+1;i++) {
String sub=keyword.substring(0, i);
// NOTE(review): getBytes() encodes with the platform default charset while the
// decode is UTF-8, so this round-trip only preserves the text when the default
// charset is UTF-8 - confirm the intent.
String encodedString=new String(sub.getBytes(),"UTF-8");
redisTemplate.opsForZSet().add(REDIS_AUTO_COMPLETE+encodedString,keyword,0);
}
keyword=keyword.trim();
// Build the IK analyzer in smart segmentation mode
Analyzer analyzer = new IKAnalyzer(true);
// Lucene TokenStream for the keyword
TokenStream ts = null;
try {
ts = analyzer.tokenStream("myfield", new StringReader(keyword));
// Token offset attribute
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
// Token text attribute
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
// Token type attribute
TypeAttribute type = ts.addAttribute(TypeAttribute.class);
// Reset the TokenStream (and with it the StringReader) before consuming it
ts.reset();
// Iterate over the segmentation results
while (ts.incrementToken()) {
String token=new String(term.toString().getBytes(),"UTF-8");
// Index every prefix of the token: substring(0, 1) up to the whole token
for(int i=1;i<token.length()+1;i++) {
String sub=token.substring(0, i);
String encodedString=new String(sub.getBytes(),"UTF-8");
redisTemplate.opsForZSet().add(REDIS_AUTO_COMPLETE+encodedString,keyword,0);
// Also index the lower-cased prefix when it differs from the original
String encodedStringLowerCase=encodedString.toLowerCase();
if(!encodedString.equals(encodedStringLowerCase)) {
redisTemplate.opsForZSet().add(REDIS_AUTO_COMPLETE+encodedStringLowerCase,keyword,0);
}
}
}
// Signal end of stream before the TokenStream is closed
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
} catch (IOException e) {
e.printStackTrace();
} finally {
// Release all resources held by the TokenStream
if(ts != null){
try {
ts.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/**
 * Prints up to ten completion candidates for a query to standard output.
 * <p>
 * Reads the sorted set keyed {@code REDIS_AUTO_COMPLETE + query} in reverse
 * score order, for scores between 0 and {@code Double.MAX_VALUE}.
 *
 * @param query text used verbatim as the key suffix
 */
private void searchKey(String query) {
    // Other lookups the author left commented out:
    //redisTemplate.opsForZSet().intersectAndStore(query, null, query)
    //Set<String> values=redisTemplate.opsForZSet().rangeWithScores(REDIS_AUTO_COMPLETE+query, 0,10);
    //Set<String> values=redisTemplate.opsForZSet().rangeByLex(REDIS_AUTO_COMPLETE+query, Range.unbounded(), Limit.limit().count(10));
    //Set<String> values=redisTemplate.opsForZSet().range(REDIS_AUTO_COMPLETE+query, 0, -1);
    String key = REDIS_AUTO_COMPLETE + query;
    Set<String> values = redisTemplate.opsForZSet().reverseRangeByScore(key, 0, Double.MAX_VALUE, 0, 10);
    for (String value : values) {
        System.out.println(value);
    }
}
}

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">ext.dic;</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">stopword.dic;</entry>
</properties>

@ -0,0 +1,5 @@
诛仙
诛仙2
梦幻诛仙
梦幻诛仙2
大罗法咒

@ -0,0 +1,65 @@
a
s
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with
使
Loading…
Cancel
Save