文本地址智能识别组件(三)

前言

姓名+电话+地址识别: 前面的文本地址智能识别组件(一)、文本地址智能识别组件(二)可以满足地址识别的基本需求, 但是经过不断的测试和真实场景的使用, 发现还是存在一些细节问题. 本着不到黄河心不死的精神, 呕心沥血地实现了终极版本——文本地址智能识别组件(三). 通过参考快递公司的识别功能, 自己重新写了一遍, 功夫不负有心人, 终于达到了作者想要的效果.

方案

首先我们参考之前的思路, 通过处理字符串来达到匹配的结果. 电话肯定是可以获取到的; 地址的话, 我们单独去数据库获取.
1.匹配需要识别文本中的电话号码
2.假设地址的省市区输入完全正确,我们直接匹配库里面的数据,然后对比省市区的名称,省市区名称前面的就是姓名, 如果姓名在详细地址后面,那就无法识别了
3.地址输入只有两级或者一级, 这种情况就需要多处理几次了: 首先查询省名称, 省名称为空则查询市名称, 查询到市名称就根据市名称查询出对应的省名称, 再查询市名称下面的区名称, 然后和文本中的地址比较.
话不多说, 直接放大招

/**
 * Splits a free-form shipping text into recipient name, phone number, region and detail address.
 *
 * <p>Service implementation. All punctuation, symbols and whitespace are stripped first, then:
 * <ol>
 *   <li>the phone number is extracted and removed from the text;</li>
 *   <li>a complete province + city + county match is requested from
 *       {@code regionMapper.getMatchingRecognition}; when it succeeds the result is returned at once;</li>
 *   <li>otherwise province, city and county are resolved one level at a time by looking for the
 *       longest region short name (administrative suffix removed) contained in the remaining text.
 *       When the province is missing it is derived from the matched city; when the city is missing
 *       every city of the province is searched for a matching county.</li>
 * </ol>
 *
 * <p>The recipient name is the text that precedes the first recognised region name, so a name
 * written after the detail address cannot be recognised.
 *
 * @param text raw text to recognise; {@code null} is treated as empty text
 * @return a map with the keys {@code "phone"} (String, empty when nothing was found),
 *         {@code "name"} (String), {@code "region"} (RegionDTO) and {@code "details"} (String)
 */
@Override
	public Map<String, Object> getTextRecognitionInfo(String text) {
		Map<String, Object> data = new HashMap<String, Object>();

		// Strip every punctuation mark, symbol and whitespace run before any matching.
		String details = ADDRESS_NOISE.matcher(text == null ? "" : text).replaceAll("").trim();

		// Fallback phone: the last run of 7-17 digits (covers landlines).
		String phone = "";
		Matcher digitRuns = PHONE_DIGIT_RUN.matcher(details);
		while (digitRuns.find()) {
			phone = digitRuns.group();
		}
		// Preferred phone: a mainland mobile number. Every mobile number is removed from the
		// text, and the last one found is the one reported.
		Matcher mobiles = MOBILE_NUMBER.matcher(details);
		String withoutPhones = details;
		while (mobiles.find()) {
			phone = mobiles.group();
			withoutPhones = removeFirstLiteral(withoutPhones, phone);
		}
		// Also drops the fallback number when no mobile number was present.
		details = removeFirstLiteral(withoutPhones, phone);
		data.put("phone", phone);

		// Fast path: the mapper recognises the complete province + city + county.
		RegionDTO region = regionMapper.getMatchingRecognition(details);
		if (region != null) {
			String provinceName = region.getProvince();
			// indexOf may be -1 if the mapper matched on something other than the literal
			// province name; in that case no name is extracted instead of throwing.
			int provinceAt = provinceName == null ? -1 : details.indexOf(provinceName);
			String fullName = provinceAt > 0 ? details.substring(0, provinceAt) : "";
			String rest = provinceAt > 0 ? details.substring(provinceAt) : details;
			data.put("region", region);
			data.put("name", fullName);
			data.put("details",
					removeFirstLiteral(rest, region.getProvince() + region.getCity() + region.getCounty()));
			return data;
		}

		region = new RegionDTO();
		String name = "";

		// ---- province: type 1 lists all provinces ----
		KdRegion province = findLongestRegionMatch(regionMapper.getRegionListByType(1), details, PROVINCE_SUFFIXES);
		if (province != null) {
			region.setProvince(province.getRegionName());
			region.setProvinceCode(province.getRegionId());
			String[] parts = cutAtRegion(details, province, PROVINCE_SUFFIXES);
			name = parts[0];
			details = parts[1];
		}

		// ---- city ----
		KdRegion city;
		if (province != null) {
			city = findLongestRegionMatch(
					regionMapper.queryregionParentId(region.getProvinceCode()), details, CITY_SUFFIXES);
		} else {
			// No province in the text: search the type-2 list (used as the city level here) and
			// derive the province from the matched city's parent.
			city = findLongestRegionMatch(regionMapper.getRegionListByType(2), details, CITY_SUFFIXES);
			if (city != null) {
				region.setProvinceCode(city.getRegionParentId());
				// NOTE(review): assumes every city row has an existing parent row - confirm.
				region.setProvince(regionMapper.selectByPrimaryKey(city.getRegionParentId()).getRegionName());
			}
		}
		if (city != null) {
			region.setCity(city.getRegionName());
			String[] parts = cutAtRegion(details, city, CITY_SUFFIXES);
			// Keep the name found in front of the province; only fill it in when still unknown.
			if (name.isEmpty()) {
				name = parts[0];
			}
			details = parts[1];
		}

		// ---- county ----
		KdRegion county = null;
		if (city != null) {
			county = findLongestRegionMatch(
					regionMapper.queryregionParentId(city.getRegionId()), details, COUNTY_SUFFIXES);
		} else if (province != null) {
			// City missing from the text: look through the counties of every city of the
			// province and keep the longest match together with its parent city.
			int bestLength = 0;
			for (KdRegion candidateCity : regionMapper.queryregionParentId(region.getProvinceCode())) {
				KdRegion candidate = findLongestRegionMatch(
						regionMapper.queryregionParentId(candidateCity.getRegionId()), details, COUNTY_SUFFIXES);
				if (candidate == null) {
					continue;
				}
				int length = stripRegionSuffix(candidate.getRegionName(), COUNTY_SUFFIXES).length();
				if (length > bestLength) {
					bestLength = length;
					county = candidate;
					region.setCity(candidateCity.getRegionName());
				}
			}
		}
		// When neither province nor city is known there is no parent to enumerate counties from.
		if (county != null) {
			region.setCounty(county.getRegionName());
			String[] parts = cutAtRegion(details, county, COUNTY_SUFFIXES);
			if (name.isEmpty()) {
				name = parts[0];
			}
			details = parts[1];
		}

		data.put("name", name);
		data.put("region", region);
		data.put("details", details);
		return data;
	}

	/** Punctuation, symbols and whitespace runs that are removed before matching. */
	private static final Pattern ADDRESS_NOISE = Pattern.compile("\\pP|\\pS|\\s+");

	/** Any run of 7 to 17 digits; used as the fallback phone number. */
	private static final Pattern PHONE_DIGIT_RUN = Pattern.compile("\\d{7,17}");

	/** Mainland China mobile number: 11 digits starting with 13-19. */
	private static final Pattern MOBILE_NUMBER = Pattern.compile("1[3-9]\\d{9}");

	/** Administrative suffixes dropped from province names; longer suffixes first. */
	private static final String[] PROVINCE_SUFFIXES = {"自治区", "省", "市"};

	/** Administrative suffixes dropped from city-level names; longer suffixes first. */
	private static final String[] CITY_SUFFIXES = {"自治县", "自治州", "地区", "市", "县", "盟"};

	/** Administrative suffixes dropped from county-level names; longer suffixes first. */
	private static final String[] COUNTY_SUFFIXES = {"街道", "农场", "兵团", "市", "区", "县", "镇", "旗", "乡"};

	/**
	 * Returns the candidate whose short name is contained in {@code details} and is the longest;
	 * the first candidate wins a tie. Returns {@code null} when nothing matches.
	 */
	private static KdRegion findLongestRegionMatch(List<KdRegion> candidates, String details, String[] suffixes) {
		if (candidates == null) {
			return null;
		}
		KdRegion best = null;
		int bestLength = 0;
		for (KdRegion candidate : candidates) {
			String shortName = stripRegionSuffix(candidate.getRegionName(), suffixes);
			if (shortName.length() > bestLength && details.contains(shortName)) {
				bestLength = shortName.length();
				best = candidate;
			}
		}
		return best;
	}

	/**
	 * Removes the first listed suffix that {@code regionName} ends with. Only a trailing suffix
	 * is removed, so names such as "沙市区" keep their inner characters ("沙市").
	 */
	private static String stripRegionSuffix(String regionName, String[] suffixes) {
		if (regionName == null) {
			return "";
		}
		for (String suffix : suffixes) {
			if (regionName.length() > suffix.length() && regionName.endsWith(suffix)) {
				return regionName.substring(0, regionName.length() - suffix.length());
			}
		}
		return regionName;
	}

	/**
	 * Splits {@code details} at the first occurrence of the matched region.
	 *
	 * @return a two-element array: the text before the region, and the text after it. The full
	 *         region name is consumed when it is written out, otherwise only the short name, so
	 *         the same word appearing later in the street address is left untouched.
	 */
	private static String[] cutAtRegion(String details, KdRegion matched, String[] suffixes) {
		String fullName = matched.getRegionName();
		String shortName = stripRegionSuffix(fullName, suffixes);
		int at = details.indexOf(shortName);
		if (at < 0) {
			// Defensive: callers only pass regions whose short name was found in details.
			return new String[] {"", details};
		}
		String rest = details.substring(at);
		int consumed = rest.startsWith(fullName) ? fullName.length() : shortName.length();
		return new String[] {details.substring(0, at), rest.substring(consumed)};
	}

	/** Removes the first literal (non-regex) occurrence of {@code target} from {@code source}. */
	private static String removeFirstLiteral(String source, String target) {
		if (target == null || target.isEmpty()) {
			return source;
		}
		int at = source.indexOf(target);
		if (at < 0) {
			return source;
		}
		return source.substring(0, at) + source.substring(at + target.length());
	}

注意事项

由于代码比较长, 然后业务主要针对我工作中的需求研发, 所以相关代码改动过, 查询地址的SQL也是根据现在的表结构处理, 可以参考相关思路进行改动, 本人亲测这个是识别率最高, 也最符合场景的.

码路尽头谁为峰?
VX18670040141POS机办理,信用卡办理

你可能感兴趣的:(搬砖之路,mysql,算法,java,自然语言处理)