Skip to content

Commit

Permalink
添加倒排索引(还未完成)
Browse files Browse the repository at this point in the history
  • Loading branch information
liuzhibin-cn committed Oct 18, 2016
1 parent 0e74fd7 commit 2492cb7
Show file tree
Hide file tree
Showing 16 changed files with 536 additions and 79 deletions.
6 changes: 5 additions & 1 deletion images/readme
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,8 @@ LaTeX:
\sum_1^n(A_i \times B_i)
}{
\sqrt{ \sum_1^n{A_i^2} } \times \sqrt{ \sum_1^n{B_i^2} }
}
}

MySQL导出命令备份:
mysqldump -hlocalhost -uroot -p --comments --no-data --skip-add-locks --skip-compact --skip-disable-keys my_research > db-schema.sql
mysqldump -hlocalhost -uroot -p --no-create-db --no-create-info --skip-set-charset --skip-add-locks --skip-lock-tables --quick --complete-insert --extended-insert --compact --databases my_research --tables bas_region > db-init-data.sql
8 changes: 4 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
<profile>
<id>test</id>
<properties>
<db.host>127.0.0.1</db.host>
<db.name>my_research</db.name>
<db.host>localhost</db.host>
<db.name>rrs_research</db.name>
<db.user>root</db.user>
<db.password>dev</db.password>
<cache.path>/Users/richie/Documents/workspace_eclipse/cache-folder</cache.path>
<db.password>zxcZXC123,.</db.password>
<cache.path>/mnt/address/cache</cache.path>
<cache.vectors.in.memory>true</cache.vectors.in.memory>
</properties>
</profile>
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/com/rrs/rd/address/index/AcceptableItem.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package com.rrs.rd.address.index;

import com.alibaba.dubbo.common.utils.Stack;

/**
 * Callback that decides whether a matched index entry may extend the chain of
 * items accepted so far.
 *
 * @author Richie (Liu Zhibin) [email protected]
 * 2016-10-17
 */
public interface AcceptableItem {
/**
 * Picks the item of {@code child} that is acceptable given the already accepted {@code parents}.
 *
 * @param parents items accepted so far
 * @param child   the matched index entry to evaluate
 * @return the accepted item, or {@code null} when nothing in {@code child} is acceptable
 */
TermIndexItem accept(Stack<TermIndexItem> parents, TermIndexEntry child);
}
70 changes: 70 additions & 0 deletions src/main/java/com/rrs/rd/address/index/TermIndexBuilder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package com.rrs.rd.address.index;

import java.util.List;

import com.rrs.rd.address.persist.RegionEntity;
import com.rrs.rd.address.persist.RegionType;
import com.rrs.rd.address.similarity.TermType;

/**
* 线程安全。
* @author Richie 刘志斌 [email protected]
* 2016年10月17日
*/
public class TermIndexBuilder {
private TermIndexEntry indexRoot = new TermIndexEntry();

/**
* 为行政区划建立倒排索引。
* @param regions
* @return
*/
public TermIndexBuilder indexRegions(List<RegionEntity> regions){
this.indexRegions(regions, indexRoot);
return this;
}
private void indexRegions(List<RegionEntity> regions, TermIndexEntry index){
if(regions==null) return;
for(RegionEntity region : regions){
for(String name : region.orderedNameAndAlias())
index.buildIndex(name, 0, convertRegionType(region.getType()), region);
if(region.getChildren()!=null)
this.indexRegions(region.getChildren(), index);
}
}
private TermType convertRegionType(RegionType type){
switch(type){
case Province:
case ProvinceLevelCity1:
return TermType.Province;
case City:
case ProvinceLevelCity2:
return TermType.City;
case County:
case CityLevelCounty:
return TermType.County;
default:
}
return null;
}

/**
* 为忽略列表建立倒排索引
* @param ignoreList
* @return
*/
public TermIndexBuilder indexIgnorings(List<String> ignoreList){
if(ignoreList==null || ignoreList.isEmpty()) return this;
for(String str : ignoreList)
this.indexRoot.buildIndex(str, 0, TermType.Undefined, null);
return this;
}

public TermIndexEntry getTermIndex(){
return this.indexRoot;
}

public TermIndexQuery getQuery(){
return new TermIndexQuery(this);
}
}
69 changes: 69 additions & 0 deletions src/main/java/com/rrs/rd/address/index/TermIndexEntry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package com.rrs.rd.address.index;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.rrs.rd.address.similarity.TermType;
import com.rrs.rd.address.utils.StringUtil;

/**
 * One node of the term index. Child nodes are keyed by the next character of
 * the indexed text, and a node may carry the items whose text ends at it.
 *
 * @author Richie (Liu Zhibin) [email protected]
 * 2016-10-16
 */
public class TermIndexEntry {
    private String key;
    private List<TermIndexItem> items;
    private Map<Character, TermIndexEntry> children;

    /**
     * Indexes {@code text} starting at character {@code pos} below this node and
     * attaches a new item to the node reached by the last character.
     * Does nothing when {@code text} is null/empty or {@code pos} is out of range.
     *
     * @param text  the text to index
     * @param pos   index of the first character to consume
     * @param type  term type stored in the new item
     * @param value value stored in the new item
     */
    public void buildIndex(String text, int pos, TermType type, Object value) {
        boolean outOfRange = text == null || text.isEmpty() || pos < 0 || pos >= text.length();
        if (outOfRange) {
            return;
        }

        TermIndexEntry node = this;
        for (int i = pos; i < text.length(); i++) {
            if (node.children == null) {
                node.children = new HashMap<Character, TermIndexEntry>(1);
            }
            char ch = text.charAt(i);
            TermIndexEntry next = node.children.get(ch);
            if (next == null) {
                // A new node's key is the text consumed so far, as returned by StringUtil.head.
                next = new TermIndexEntry();
                next.key = StringUtil.head(text, i + 1);
                node.children.put(ch, next);
            }
            node = next;
        }
        // 'node' is now the entry for the last character of the text.
        node.addItem(type, value);
    }

    /**
     * @return the key of this entry; never assigned for an entry created directly with {@code new}
     */
    public String getKey() {
        return this.key;
    }

    /**
     * @return {@code true} when at least one item is attached to this entry
     */
    public boolean hasItem() {
        if (this.items == null) {
            return false;
        }
        return !this.items.isEmpty();
    }

    /**
     * @return the attached items, or {@code null} when none has been added yet
     */
    public List<TermIndexItem> getItems() {
        return this.items;
    }

    /**
     * Attaches an item to this entry, creating the item list on first use.
     *
     * @return this entry
     */
    public TermIndexEntry addItem(TermIndexItem item) {
        if (this.items == null) {
            this.items = new ArrayList<TermIndexItem>(1);
        }
        this.items.add(item);
        return this;
    }

    /**
     * Wraps {@code type} and {@code value} into a new item and attaches it.
     *
     * @return this entry
     */
    public TermIndexEntry addItem(TermType type, Object value) {
        TermIndexItem created = new TermIndexItem(type, value);
        return this.addItem(created);
    }

    /**
     * @return the child entries keyed by character, or {@code null} when this entry has none
     */
    public Map<Character, TermIndexEntry> getChildren() {
        return this.children;
    }

    @Override
    public String toString() {
        return this.key;
    }
}
38 changes: 38 additions & 0 deletions src/main/java/com/rrs/rd/address/index/TermIndexItem.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package com.rrs.rd.address.index;

import com.rrs.rd.address.similarity.TermType;

/**
 * An object stored in the term index: a term type plus an arbitrary value.
 *
 * @author Richie (Liu Zhibin) [email protected]
 * 2016-10-16
 */
public class TermIndexItem {
    private TermType type;
    private Object value;

    /**
     * @param type  the term type of this item
     * @param value the value carried by this item; may be {@code null}
     */
    public TermIndexItem(TermType type, Object value) {
        this.type = type;
        this.value = value;
    }

    /**
     * @return the term type of this item
     */
    public TermType getType() {
        return type;
    }

    /**
     * @param type the new term type
     */
    public void setType(TermType type) {
        this.type = type;
    }

    /**
     * @return the value carried by this item; may be {@code null}
     */
    public Object getValue() {
        return value;
    }

    /**
     * @param value the new value; may be {@code null}
     */
    public void setValue(Object value) {
        this.value = value;
    }

    /**
     * Returns the string form of the value.
     *
     * @return {@code value.toString()}, or the text {@code "null"} when no value is set,
     *         so that this method never hands a null reference to its caller
     */
    @Override
    public String toString() {
        return String.valueOf(this.value);
    }
}
84 changes: 84 additions & 0 deletions src/main/java/com/rrs/rd/address/index/TermIndexQuery.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package com.rrs.rd.address.index;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.alibaba.dubbo.common.utils.Stack;

/**
 * Runs lookups against the index held by a {@link TermIndexBuilder}.
 * Per the original author: not thread-safe, use within a single thread only.
 *
 * @author Richie (Liu Zhibin) [email protected]
 * 2016-10-17
 */
public class TermIndexQuery {
    private TermIndexBuilder builder = null;

    public TermIndexQuery(TermIndexBuilder builder) {
        this.builder = builder;
    }

    /**
     * Follows the characters of {@code text} from {@code pos} through the index and
     * collects every entry on that path that has items, shortest match first.
     *
     * @param text the text to look up
     * @param pos  index of the first character to match
     * @return the matching entries, or {@code null} when nothing matched
     */
    public List<TermIndexEntry> simpleQuery(String text, int pos) {
        return simpleQuery(text, pos, builder.getTermIndex().getChildren(), null);
    }

    /**
     * Walks down from {@code entries}, appending matches to {@code foundList}
     * (the list is created lazily on the first match).
     */
    private List<TermIndexEntry> simpleQuery(String text, int pos
            , Map<Character, TermIndexEntry> entries, List<TermIndexEntry> foundList) {
        if (text == null || text.isEmpty() || pos < 0) {
            return foundList;
        }

        List<TermIndexEntry> hits = foundList;
        Map<Character, TermIndexEntry> level = entries;
        for (int i = pos; level != null && i < text.length(); i++) {
            TermIndexEntry node = level.get(text.charAt(i));
            if (node == null) {
                break;
            }
            if (node.hasItem()) {
                if (hits == null) {
                    hits = new ArrayList<TermIndexEntry>(1);
                }
                hits.add(node);
            }
            level = node.getChildren();
        }
        return hits;
    }

    /**
     * Repeatedly matches index entries one after another starting at {@code pos},
     * asking {@code acceptable} which item of each match may be chained, and returns
     * the longest chain of accepted items that was found.
     *
     * @param text       the text to analyse
     * @param pos        index of the first character to match
     * @param acceptable decides which item of a matched entry is accepted
     * @return the longest chain found (the first one found wins on equal length); never {@code null}
     */
    public List<TermIndexItem> deepMostQuery(String text, int pos, AcceptableItem acceptable) {
        List<TermIndexItem> longest = new ArrayList<TermIndexItem>();
        doDeepMostQuery(text, pos, longest, new Stack<TermIndexItem>(), acceptable);
        return longest;
    }

    /**
     * Recursive step of {@link #deepMostQuery}: tries every entry matching at {@code pos},
     * longest match first, and continues right behind each accepted match.
     */
    private void doDeepMostQuery(String text, int pos, List<TermIndexItem> result, Stack<TermIndexItem> stack, AcceptableItem acceptable) {
        List<TermIndexEntry> candidates = simpleQuery(text, pos);
        if (candidates == null) {
            // Nothing matches here: this chain of the recursion is complete.
            keepIfDeeper(result, stack);
            return;
        }

        // Keep matching recursively.
        int idx = candidates.size();
        while (--idx >= 0) {
            TermIndexEntry matched = candidates.get(idx);
            TermIndexItem accepted = acceptable.accept(stack, matched);
            if (accepted != null) {
                stack.push(accepted);
                doDeepMostQuery(text, pos + matched.getKey().length(), result, stack, acceptable);
                stack.pop();
            } else {
                // Rejected: this chain of the recursion is complete.
                keepIfDeeper(result, stack);
            }
        }
    }

    /**
     * Replaces the content of {@code result} with the current stack when the stack is strictly deeper.
     */
    private static void keepIfDeeper(List<TermIndexItem> result, Stack<TermIndexItem> stack) {
        if (stack.size() <= result.size()) {
            return;
        }
        result.clear();
        for (int i = 0; i < stack.size(); i++) {
            result.add(stack.get(i));
        }
    }
}
58 changes: 58 additions & 0 deletions src/main/java/com/rrs/rd/address/interpret/AcceptableRegion.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package com.rrs.rd.address.interpret;

import com.alibaba.dubbo.common.utils.Stack;
import com.rrs.rd.address.index.AcceptableItem;
import com.rrs.rd.address.index.TermIndexEntry;
import com.rrs.rd.address.index.TermIndexItem;
import com.rrs.rd.address.persist.AddressPersister;
import com.rrs.rd.address.persist.RegionEntity;
import com.rrs.rd.address.persist.RegionType;
import com.rrs.rd.address.similarity.TermType;

public class AcceptableRegion implements AcceptableItem {
private AddressPersister persister = null;

public AcceptableRegion(AddressPersister persister){
this.persister = persister;
}

@Override
public TermIndexItem accept(Stack<TermIndexItem> parents, TermIndexEntry child) {
if(child==null) return null;
if(parents.isEmpty()){ //不存在父节点时,找一个级别最高的
if(!child.hasItem()) return null;
TermIndexItem topItem = null;
for(TermIndexItem item : child.getItems()){
if(item.getType()!=TermType.Province && item.getType()!=TermType.City && item.getType()!=TermType.County)
continue;
RegionEntity region = (RegionEntity)item.getValue();
if(region.getType()==RegionType.Undefined) continue;
if(topItem==null){
topItem = item;
continue;
}
if(region.getType().toValue()<((RegionEntity)topItem.getValue()).getType().toValue()){
topItem = item;
continue;
}
}
return topItem;
}
//child中的行政区域必须隶属于parents
for(int i=parents.size()-1; i>=0; i--) {
TermIndexItem pItem = parents.get(i);
RegionEntity pRegion = (RegionEntity)pItem.getValue();
for(TermIndexItem cItem : child.getItems()) {
RegionEntity cRegion = (RegionEntity)cItem.getValue();
if(pRegion.getId() == cRegion.getId()) return cItem; //相同,可接受(移除冗余时需要)
if(pRegion.getId() == cRegion.getParentId()) return cItem; //child直接隶属于parent
if(cRegion.getParentId()>1){
RegionEntity region = persister.getRegion(cRegion.getParentId());
if(region.getParentId()==pRegion.getId()) return cItem; //child间接隶属于parent,为中间缺一级的情况容错
}
}
}
return null;
}

}
14 changes: 14 additions & 0 deletions src/main/java/com/rrs/rd/address/interpret/AddressInterpreter.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.rrs.rd.address.index.TermIndexBuilder;
import com.rrs.rd.address.persist.AddressEntity;
import com.rrs.rd.address.persist.AddressPersister;
import com.rrs.rd.address.persist.RegionEntity;
Expand Down Expand Up @@ -178,6 +179,19 @@ public AddressEntity interpret(String addressText){
return addr;
}

/**
 * Unfinished index-based region extraction. At present it only builds a term
 * index from the root region's children and the invalid region names; the index
 * is not used yet and {@code addr} is left untouched.
 *
 * @param addr the address to process (currently unused)
 */
public void extractRegion2(AddressEntity addr){
    TermIndexBuilder indexBuilder = new TermIndexBuilder()
            .indexRegions(persister.rootRegion().getChildren())
            .indexIgnorings(invalidRegionNames);

    // TODO: matching loop not implemented yet. Sketch left by the original author:
    // int pos = 0;
    // List<TermIndexEntry> entries = manager.searchOne(addr.getText(), pos);
    // while(entries!=null){
    //
    // entries = manager.searchOne(addr.getText(), pos);
    // }
}

//***************************************************************************************
// 私有方法,出于单元测试目的部分方法设置为了public
//***************************************************************************************
Expand Down
Loading

0 comments on commit 2492cb7

Please sign in to comment.