一种根据批准文号匹配条码的方法

2020-03-02  本文已影响0人  EasyNetCN

用户商品信息和标准库匹配是常见的业务,下面的例子(含代码,非正式,只供参考),提供了一种根据商品的批准文号,利用标准库数据,匹配用户商品的条码的一种方法。

数据基本结构和状况

1:标准数据由于是其他方提供,为了维护方便,标准数据保持不变
2:根据标准数据因素,创建对应的自己的标准数据库表,这张表中只包含需要标准化的信息字段。根据前文 https://www.jianshu.com/p/f646dec59f5d,提到的思路进行标准化数据处理
3:条码作为主键,条码与批准文号是一对多的关系。所以在本需要中,需要找到唯一一个匹配的商品条码

匹配基本逻辑和规则

1:假如存在批准文号对应的唯一条形码,则直接匹配成功
2:假如存在多个,则分析出优先级规则,降级逐步处理,直到最后找的唯一的一个匹配条形码,或者在所有规则中不符合

以下是python代码,仅供参考

import pandas as pd
import re
import sqlalchemy
from sqlalchemy import text

replaces =[
    {"src":"毫克","type":1,"dest":"mg"},
    {"src":"克","type":1,"dest":"g"},
    {"src":"毫升","type":1,"dest":"ml"},
    {"src":"升","type":1,"dest":"l"},
    {"src":"公斤","type":1,"dest":"kg"},
    {"src":"。","type":1,"dest":" "},
    {"src":",","type":1,"dest":" "},
    {"src":"x","type":1,"dest":"*"},
    {"src":"×","type":1,"dest":"*"},
    {"src":" +","type":2,"dest":" "}
]

def sbc_to_dbc(ustring):
    ss = []
    
    for s in ustring:
        rstring = ""
        
        for uchar in s:
            inside_code = ord(uchar)
            
            if inside_code == 12288:  # 全角空格直接转换
                inside_code = 32
            elif (inside_code >= 65281 and inside_code <= 65374):  # 全角字符(除空格)根据关系转化
                inside_code -= 65248
                
            rstring += chr(inside_code)
            
        ss.append(rstring)
    return ''.join(ss)

def text_replace(txt):
    for item in replaces:
        replace_type = item["type"]
        src = item["src"]
        dest = item["dest"]
        
        if replace_type == 1:
            txt = txt.replace(src,dest)
        elif replace_type == 2:
            txt = re.sub(src,dest,txt)
    
    return txt

def normalize(txt):
    return text_replace(sbc_to_dbc(txt.strip().lower()))

def update(conn,k,row):
    conn.execute(text('UPDATE goods_src_data SET standard_goods_id=:id_norm WHERE id=:id'),{"id":int(k),"id_norm":int(row.id_norm)})

conn_str = ""

src_data = pd.read_sql(sql="SELECT a.*,b.id AS id_norm,b.name AS name_norm,b.form AS form_norm,b.product_manufacturer AS product_manufacturer_norm FROM `goods_src_data` a JOIN goods_norm b ON a.approval_number=b.approval_number WHERE a.approval_number<>'' AND a.approval_number<>'-' AND standard_goods_id=0 ORDER BY a.id",con=conn_str)

print(len(src_data))

group_data = src_data.groupby("id")

print(len(group_data))

engine = sqlalchemy.create_engine(conn_str)
conn = engine.connect()


counter=0
j=0

d={}
for k,v in group_data:
    if len(v)==1:
        row = v.iloc[0]
        #print(row)
        conn.execute(text('UPDATE goods_src_data SET standard_goods_id=:id_norm WHERE id=:id'),{"id":int(k),"id_norm":int(row.id_norm)})
    else:
        filter_data = v[((v["name"]==v["name_norm"]) | (v.generic_name==v["name_norm"])) & (v.specification==v.form_norm) & (v.goods_manufacturer_name==v.product_manufacturer_norm)]
        
        l = len(filter_data)
        
        if l==1:
            ##print(filter_data.id,filter_data.id_norm)
            r=filter_data.iloc[0]
            d[r.id]=r.id_norm
            update(conn,r.id,r)
            counter+=1
        
        if l!=1:
            filter_data = v[((v["name"]==v["name_norm"]) | (v.generic_name==v["name_norm"])) & (v.specification==v.form_norm)]
            
            l = len(filter_data)
            
            if l==1:
                ##print(filter_data.id,filter_data.id_norm)
                r=filter_data.iloc[0]
                d[r.id]=r.id_norm
                update(conn,r.id,r)
                counter+=1
                
        if l!=1:
            filter_data = v[(v.specification==v.form_norm)]
            
            l = len(filter_data)
            
            if l==1:
                ##print(filter_data.id,filter_data.id_norm)
                r=filter_data.iloc[0]
                d[r.id]=r.id_norm
                update(conn,r.id,r)
                counter+=1
                
        if l!=1:
            filter_data = v
            arr = []
            
            for i,r in filter_data.iterrows():
                if (r.specification in r.form_norm) or (r.form_norm in r.specification):
                    arr.append(r)
                    
                    if len(arr)>1:
                        break
                        
            
            l = len(arr)
            
            if l>0:
                j+=1
            
            if l==1:
                ##print(filter_data.id,filter_data.id_norm)
                r=arr[0]
                d[r.id]=r.id_norm
                update(conn,r.id,r)
                counter+=1
                
            arr = []
            
            for i,r in filter_data.iterrows():
                if r.specification in r.form_norm:
                    arr.append(r)
                    
                    if len(arr)>1:
                        break
                        
            
            l = len(arr)
            
            if l>0:
                j+=1
            
            if l==1:
                ##print(filter_data.id,filter_data.id_norm)
                r=arr[0]
                d[r.id]=r.id_norm
                update(conn,r.id,r)
                counter+=1
                
            for i,r in filter_data.iterrows():
                if r.form_norm in r.specification:
                    arr.append(r)
                    
                    if len(arr)>1:
                        break
                        
            
            l = len(arr)
            
            if l>0:
                j+=1
            
            if l==1:
                ##print(filter_data.id,filter_data.id_norm)
                r=arr[0]
                d[r.id]=r.id_norm
                update(conn,r.id,r)
                counter+=1
                
    
        

conn.close()
print("counter:",counter,"j:",j)
print(len(d))
上一篇下一篇

猜你喜欢

热点阅读