forked from liuzhibin-cn/address-semantic-search
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0e74fd7
commit 2492cb7
Showing
16 changed files
with
536 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 12 additions & 0 deletions
12
src/main/java/com/rrs/rd/address/index/AcceptableItem.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package com.rrs.rd.address.index; | ||
|
||
import com.alibaba.dubbo.common.utils.Stack; | ||
|
||
/** | ||
* | ||
* @author Richie 刘志斌 [email protected] | ||
* 2016年10月17日 | ||
*/ | ||
public interface AcceptableItem { | ||
TermIndexItem accept(Stack<TermIndexItem> parents, TermIndexEntry child); | ||
} |
70 changes: 70 additions & 0 deletions
70
src/main/java/com/rrs/rd/address/index/TermIndexBuilder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package com.rrs.rd.address.index; | ||
|
||
import java.util.List; | ||
|
||
import com.rrs.rd.address.persist.RegionEntity; | ||
import com.rrs.rd.address.persist.RegionType; | ||
import com.rrs.rd.address.similarity.TermType; | ||
|
||
/** | ||
* 线程安全。 | ||
* @author Richie 刘志斌 [email protected] | ||
* 2016年10月17日 | ||
*/ | ||
public class TermIndexBuilder { | ||
private TermIndexEntry indexRoot = new TermIndexEntry(); | ||
|
||
/** | ||
* 为行政区划建立倒排索引。 | ||
* @param regions | ||
* @return | ||
*/ | ||
public TermIndexBuilder indexRegions(List<RegionEntity> regions){ | ||
this.indexRegions(regions, indexRoot); | ||
return this; | ||
} | ||
private void indexRegions(List<RegionEntity> regions, TermIndexEntry index){ | ||
if(regions==null) return; | ||
for(RegionEntity region : regions){ | ||
for(String name : region.orderedNameAndAlias()) | ||
index.buildIndex(name, 0, convertRegionType(region.getType()), region); | ||
if(region.getChildren()!=null) | ||
this.indexRegions(region.getChildren(), index); | ||
} | ||
} | ||
private TermType convertRegionType(RegionType type){ | ||
switch(type){ | ||
case Province: | ||
case ProvinceLevelCity1: | ||
return TermType.Province; | ||
case City: | ||
case ProvinceLevelCity2: | ||
return TermType.City; | ||
case County: | ||
case CityLevelCounty: | ||
return TermType.County; | ||
default: | ||
} | ||
return null; | ||
} | ||
|
||
/** | ||
* 为忽略列表建立倒排索引 | ||
* @param ignoreList | ||
* @return | ||
*/ | ||
public TermIndexBuilder indexIgnorings(List<String> ignoreList){ | ||
if(ignoreList==null || ignoreList.isEmpty()) return this; | ||
for(String str : ignoreList) | ||
this.indexRoot.buildIndex(str, 0, TermType.Undefined, null); | ||
return this; | ||
} | ||
|
||
public TermIndexEntry getTermIndex(){ | ||
return this.indexRoot; | ||
} | ||
|
||
public TermIndexQuery getQuery(){ | ||
return new TermIndexQuery(this); | ||
} | ||
} |
69 changes: 69 additions & 0 deletions
69
src/main/java/com/rrs/rd/address/index/TermIndexEntry.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package com.rrs.rd.address.index; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import com.rrs.rd.address.similarity.TermType; | ||
import com.rrs.rd.address.utils.StringUtil; | ||
|
||
/** | ||
* 索引条目。 | ||
* @author Richie 刘志斌 [email protected] | ||
* 2016年10月16日 | ||
*/ | ||
public class TermIndexEntry { | ||
private String key; | ||
private List<TermIndexItem> items; | ||
private Map<Character, TermIndexEntry> children; | ||
|
||
public void buildIndex(String text, int pos, TermType type, Object value){ | ||
if(text==null || text.isEmpty() || pos<0 || pos>=text.length()) return; | ||
|
||
char c = text.charAt(pos); | ||
if(this.children==null) this.children = new HashMap<Character, TermIndexEntry>(1); | ||
|
||
TermIndexEntry entry = this.children.get(c); | ||
if(entry==null) { | ||
entry = new TermIndexEntry(); | ||
entry.key = StringUtil.head(text, pos + 1); | ||
this.children.put(c, entry); | ||
} | ||
|
||
if(pos==text.length()-1) { | ||
entry.addItem(type, value); | ||
return; | ||
} | ||
|
||
entry.buildIndex(text, pos + 1, type, value); | ||
} | ||
|
||
public String getKey(){ | ||
return this.key; | ||
} | ||
|
||
public boolean hasItem(){ | ||
return this.items != null && !this.items.isEmpty(); | ||
} | ||
public List<TermIndexItem> getItems(){ | ||
return this.items; | ||
} | ||
public TermIndexEntry addItem(TermIndexItem item){ | ||
if(this.items==null) this.items = new ArrayList<TermIndexItem>(1); | ||
this.items.add(item); | ||
return this; | ||
} | ||
public TermIndexEntry addItem(TermType type, Object value){ | ||
return this.addItem(new TermIndexItem(type, value)); | ||
} | ||
|
||
public Map<Character, TermIndexEntry> getChildren(){ | ||
return this.children; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return this.key; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
package com.rrs.rd.address.index; | ||
|
||
import com.rrs.rd.address.similarity.TermType; | ||
|
||
/** | ||
* 索引对象。 | ||
* @author Richie 刘志斌 [email protected] | ||
* 2016年10月16日 | ||
*/ | ||
public class TermIndexItem { | ||
private TermType type; | ||
private Object value; | ||
|
||
public TermIndexItem(TermType type, Object value){ | ||
this.type = type; | ||
this.value = value; | ||
} | ||
|
||
public TermType getType() { | ||
return type; | ||
} | ||
public void setType(TermType type) { | ||
this.type = type; | ||
} | ||
|
||
public Object getValue() { | ||
return value; | ||
} | ||
public void setValue(Object value) { | ||
this.value = value; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
if(this.value==null) return null; | ||
return this.value.toString(); | ||
} | ||
} |
84 changes: 84 additions & 0 deletions
84
src/main/java/com/rrs/rd/address/index/TermIndexQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
package com.rrs.rd.address.index; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import com.alibaba.dubbo.common.utils.Stack; | ||
|
||
/** | ||
* 非线程安全,只能单线程内使用。 | ||
* @author Richie 刘志斌 [email protected] | ||
* 2016年10月17日 | ||
*/ | ||
public class TermIndexQuery { | ||
private TermIndexBuilder builder = null; | ||
|
||
public TermIndexQuery(TermIndexBuilder builder){ | ||
this.builder = builder; | ||
} | ||
|
||
/** | ||
* | ||
* @param text | ||
* @param pos | ||
* @return | ||
*/ | ||
public List<TermIndexEntry> simpleQuery(String text, int pos){ | ||
return simpleQuery(text, pos, builder.getTermIndex().getChildren(), null); | ||
} | ||
private List<TermIndexEntry> simpleQuery(String text, int pos | ||
, Map<Character, TermIndexEntry> entries, List<TermIndexEntry> foundList){ | ||
if(text==null || text.isEmpty() || entries==null || pos<0 || pos>=text.length()) return foundList; | ||
|
||
char c = text.charAt(pos); | ||
TermIndexEntry entry = entries.get(c); | ||
if(entry==null) return foundList; | ||
|
||
if(entry.hasItem()) foundList = merge(foundList, entry); | ||
foundList = simpleQuery(text, pos + 1, entry.getChildren(), foundList); | ||
|
||
return foundList; | ||
} | ||
private List<TermIndexEntry> merge(List<TermIndexEntry> target, TermIndexEntry source){ | ||
if(source==null) return target; | ||
List<TermIndexEntry> result = target; | ||
if(result==null) result = new ArrayList<TermIndexEntry>(1); | ||
result.add(source); | ||
return result; | ||
} | ||
|
||
public List<TermIndexItem> deepMostQuery(String text, int pos, AcceptableItem acceptable){ | ||
List<TermIndexItem> result = new ArrayList<TermIndexItem>(); | ||
Stack<TermIndexItem> stack = new Stack<TermIndexItem>(); | ||
doDeepMostQuery(text, pos, result, stack, acceptable); | ||
return result; | ||
} | ||
private void doDeepMostQuery(String text, int pos, List<TermIndexItem> result, Stack<TermIndexItem> stack, AcceptableItem acceptable){ | ||
List<TermIndexEntry> list = simpleQuery(text, pos); | ||
if(list==null) { | ||
//一轮递归结束 | ||
if(stack.size()>result.size()){ | ||
result.clear(); | ||
for(int i=0; i<stack.size(); i++) result.add(stack.get(i)); | ||
} | ||
return; | ||
} | ||
//继续递归匹配 | ||
for(int i=list.size()-1; i>=0; i--) { | ||
TermIndexEntry matched = list.get(i); | ||
TermIndexItem accepted = acceptable.accept(stack, matched); | ||
if(accepted==null) { | ||
//一轮递归结束 | ||
if(stack.size()>result.size()){ | ||
result.clear(); | ||
for(int j=0; j<stack.size(); j++) result.add(stack.get(j)); | ||
} | ||
continue; | ||
} | ||
stack.push(accepted); | ||
doDeepMostQuery(text, pos + matched.getKey().length(), result, stack, acceptable); | ||
stack.pop(); | ||
} | ||
} | ||
} |
58 changes: 58 additions & 0 deletions
58
src/main/java/com/rrs/rd/address/interpret/AcceptableRegion.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package com.rrs.rd.address.interpret; | ||
|
||
import com.alibaba.dubbo.common.utils.Stack; | ||
import com.rrs.rd.address.index.AcceptableItem; | ||
import com.rrs.rd.address.index.TermIndexEntry; | ||
import com.rrs.rd.address.index.TermIndexItem; | ||
import com.rrs.rd.address.persist.AddressPersister; | ||
import com.rrs.rd.address.persist.RegionEntity; | ||
import com.rrs.rd.address.persist.RegionType; | ||
import com.rrs.rd.address.similarity.TermType; | ||
|
||
public class AcceptableRegion implements AcceptableItem { | ||
private AddressPersister persister = null; | ||
|
||
public AcceptableRegion(AddressPersister persister){ | ||
this.persister = persister; | ||
} | ||
|
||
@Override | ||
public TermIndexItem accept(Stack<TermIndexItem> parents, TermIndexEntry child) { | ||
if(child==null) return null; | ||
if(parents.isEmpty()){ //不存在父节点时,找一个级别最高的 | ||
if(!child.hasItem()) return null; | ||
TermIndexItem topItem = null; | ||
for(TermIndexItem item : child.getItems()){ | ||
if(item.getType()!=TermType.Province && item.getType()!=TermType.City && item.getType()!=TermType.County) | ||
continue; | ||
RegionEntity region = (RegionEntity)item.getValue(); | ||
if(region.getType()==RegionType.Undefined) continue; | ||
if(topItem==null){ | ||
topItem = item; | ||
continue; | ||
} | ||
if(region.getType().toValue()<((RegionEntity)topItem.getValue()).getType().toValue()){ | ||
topItem = item; | ||
continue; | ||
} | ||
} | ||
return topItem; | ||
} | ||
//child中的行政区域必须隶属于parents | ||
for(int i=parents.size()-1; i>=0; i--) { | ||
TermIndexItem pItem = parents.get(i); | ||
RegionEntity pRegion = (RegionEntity)pItem.getValue(); | ||
for(TermIndexItem cItem : child.getItems()) { | ||
RegionEntity cRegion = (RegionEntity)cItem.getValue(); | ||
if(pRegion.getId() == cRegion.getId()) return cItem; //相同,可接受(移除冗余时需要) | ||
if(pRegion.getId() == cRegion.getParentId()) return cItem; //child直接隶属于parent | ||
if(cRegion.getParentId()>1){ | ||
RegionEntity region = persister.getRegion(cRegion.getParentId()); | ||
if(region.getParentId()==pRegion.getId()) return cItem; //child间接隶属于parent,为中间缺一级的情况容错 | ||
} | ||
} | ||
} | ||
return null; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.