20190722工作进展

2019-07-22  本文已影响0人  Songger
  1. rm -rf ../../origin_deep_cluster_odps_8.tar.gz
    tar -cvzf ../../origin_deep_cluster_odps_8.tar.gz *

  2. 商品的title页数只要前几页
    表在这里:hs_tmp_dssm_1

-- Deduplicate: drop exact duplicate rows from the raw (query, title) table.
CREATE TABLE hs_tmp_dssm_2 AS
SELECT DISTINCT *
FROM hs_tmp_dssm_1;

  1. 得到正样本
    -- Positive samples: every observed (query, title) pair is labelled 1.
    CREATE TABLE hs_tmp_22 AS
    SELECT
        se_keyword_ws,
        title_ws,
        1 AS label
    FROM hs_tmp_dssm_2;

  2. 得到负样本

-- Keyword frequency table: how often each search keyword occurs in the
-- deduplicated corpus (feeds the negative-sampling step below).
-- NOTE(review): the original ended with "ORDER BY freq". Row order inside a
-- stored table carries no guarantee, and MaxCompute rejects ORDER BY without
-- LIMIT in many contexts — sort at read time instead.
CREATE TABLE hs_tmp_24 AS
SELECT
    se_keyword,
    COUNT(*) AS freq
FROM hs_tmp_dssm_2
GROUP BY se_keyword;

-- Attach the per-keyword index from hs_tmp_25 to every sample row.
-- NOTE(review): the original used the non-ANSI "==" comparison and wrapped
-- both tables in redundant "SELECT *" subqueries; "=" plus direct table
-- references is equivalent and standard.
CREATE TABLE hs_tmp_dssm_3 AS
SELECT
    b.index,
    a.*
FROM hs_tmp_dssm_2 a
INNER JOIN hs_tmp_25 b
    ON a.se_keyword = b.se_keyword;

-- Candidate pool for sampling: keep only the (index, title) projection.
CREATE TABLE hs_tmp_dssm_4 AS
SELECT
    index,
    title_ws
FROM hs_tmp_dssm_3;

-- ODPS console steps to (re)register the negative-sampling UDF.
-- NOTE(review): transcribed as-is from the work log — the names are
-- inconsistent (hs_udf_9.py is added but the USING list references
-- hs_table_list_3 while the table is registered as hs_table_list_2), and the
-- first line appears to be a truncated local shell rename, not an ODPS
-- command. Verify resource names before re-running.
rename hs_udf_2 hs_udf_3 hs_udf_2*
add table hs_tmp_dssm_7 as hs_table_list_2;
add py /home/hengsong/hs_udf_9.py;
CREATE FUNCTION hs_negetive_samples_9 AS hs_udf_9.Processor USING hs_udf_9.py, hs_table_list_3;

-- Negative sampling: for each (index, freq) row the UDTF emits sampled
-- (index, title_id) pairs. "graph_embedding:" is the ODPS project prefix.
create table hs_tmp_26 as
select graph_embedding:hs_negetive_samples_9(index, freq) as (index, title_id) from hs_tmp_29;

-- Same call with an earlier UDF version and a different input table —
-- presumably kept for comparison; TODO confirm whether hs_tmp_27 or
-- hs_tmp_29 is the current input.
select graph_embedding:hs_negetive_samples_2(index, freq) as (index, title_id) from hs_tmp_27;

-- Submit the PyTorch job (code tarball built in step 1) to PAI:
-- entry point test_query_with_title.py reads the two sample tables and writes
-- results to hs_tmp_30; -DworkerCount=1 means a single worker, and the
-- OSS bucket/ARN/host settings grant the job storage access.
pai -name pytorch -project algo_public_dev -Dpython=3.6 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="test_query_with_title.py" -Dtables="odps://graph_embedding/tables/hs_tmp_dssm_6,odps://graph_embedding/tables/hs_tmp_27" -Doutputs="odps://graph_embedding/tables/hs_tmp_30" -Dbucket="oss://bucket-automl/" -Darn="acs:ram::1293303983251548:role/graph2018" -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="" -DworkerCount=1;

负采样运行程序:

http://logview.odps.aliyun-inc.com:8080/logview/?h=http://service-corp.odps.aliyun-inc.com/api&p=graph_embedding&i=20190722122808349g5viw8y_087c9cf0_f3d1_4899_a192_3e3af02f5f11&token=VGszRTBMeWY5cWVGZHMyNVREb2F1NlZEdDJnPSxPRFBTX09CTzoxMjkzMzAzOTgzMjUxNTQ4LDE1NjQ0MDMyODgseyJTdGF0ZW1lbnQiOlt7IkFjdGlvbiI6WyJvZHBzOlJlYWQiXSwiRWZmZWN0IjoiQWxsb3ciLCJSZXNvdXJjZSI6WyJhY3M6b2RwczoqOnByb2plY3RzL2dyYXBoX2VtYmVkZGluZy9pbnN0YW5jZXMvMjAxOTA3MjIxMjI4MDgzNDlnNXZpdzh5XzA4N2M5Y2YwX2YzZDFfNDg5OV9hMTkyXzNlM2FmMDJmNWYxMSJdfV0sIlZlcnNpb24iOiIxIn0=

  1. 统计title分词数量:
    -- Title-length proxy: count of spaces in the whitespace-tokenised title
    -- (i.e. tokens - 1). The statistics quoted below were computed from this
    -- column. An alias is added so the column gets a usable name instead of
    -- an auto-generated one (_c0).
    create table hs_title_length as select REGEXP_COUNT(title_ws, ' ') as ws_cnt from hs_tmp_dssm_1;
    总数量:9006956029

>20: 483471878
<15: 5387341510
<18: 8160735058
取20更好一点

  1. id化

# ID-mapping step: hash the raw string feature into one of fea_max_list[i]
# buckets, then fetch that row from the i-th embedding table.
# NOTE(review): emb_list / data_list / fea_max_list are defined in surrounding
# (unshown) code; string_to_hash_bucket can collide when fea_max_list[i] is
# small relative to the vocabulary — confirm bucket sizes.
emb = tf.nn.embedding_lookup(emb_list[i], tf.string_to_hash_bucket(data_list[i], fea_max_list[i]))

  1. docker 多用户使用

# Print the host PID of the container's init process (container id 44da6a70ba46).
sudo docker inspect --format "{{ .State.Pid }}" 44da6a70ba46
# Enter that process's mount/uts/ipc/net/pid namespaces to get a shell inside
# the container. NOTE(review): 258026 is presumably the PID printed by the
# previous command — substitute the value from your own run.
sudo nsenter --target 258026 --mount --uts --ipc --net --pid

上一篇 下一篇

猜你喜欢

热点阅读