20190722工作进展
-
rm -rf ../../origin_deep_cluster_odps_8.tar.gz
tar -cvzf ../../origin_deep_cluster_odps_8.tar.gz *
-
商品的title页数只要前几页
表在这里:hs_tmp_dssm_1
去重:create table hs_tmp_dssm_2 as select distinct * from hs_tmp_dssm_1;
-
得到正样本
create table hs_tmp_22 as select se_keyword_ws, title_ws, 1 as label from hs_tmp_dssm_2;
-
得到负样本
create table hs_tmp_24 as select se_keyword, count(*) as freq from hs_tmp_dssm_2 group by se_keyword order by freq;
create table hs_tmp_dssm_3 as select b.index, a.* from
(select * from hs_tmp_dssm_2)a join (select * from hs_tmp_25)b on a.se_keyword == b.se_keyword;
create table hs_tmp_dssm_4 as select index, title_ws from hs_tmp_dssm_3;
rename hs_udf_2 hs_udf_3 hs_udf_2*
add table hs_tmp_dssm_7 as hs_table_list_2;
add py /home/hengsong/hs_udf_9.py;
CREATE FUNCTION hs_negetive_samples_9 AS hs_udf_9.Processor USING hs_udf_9.py, hs_table_list_3;
create table hs_tmp_26 as
select graph_embedding:hs_negetive_samples_9(index, freq) as (index, title_id) from hs_tmp_29;
select graph_embedding:hs_negetive_samples_2(index, freq) as (index, title_id) from hs_tmp_27;
pai -name pytorch -project algo_public_dev -Dpython=3.6 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="test_query_with_title.py" -Dtables="odps://graph_embedding/tables/hs_tmp_dssm_6,odps://graph_embedding/tables/hs_tmp_27" -Doutputs="odps://graph_embedding/tables/hs_tmp_30" -Dbucket="oss://bucket-automl/" -Darn="acs:ram::1293303983251548:role/graph2018" -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="" -DworkerCount=1;
负采样运行程序:
- 统计title分词数量:
create table hs_title_length as select REGEXP_COUNT(title_ws, ' ') from hs_tmp_dssm_1;
总数量:9006956029
20: 483471878
<15: 5387341510
<18: 8160735058
结论:title 分词数量上限取20更好一点
- id化
emb = tf.nn.embedding_lookup(emb_list[i], tf.string_to_hash_bucket(data_list[i], fea_max_list[i]))
- docker 多用户使用(先用 docker inspect 查出容器的 PID,再用 nsenter 进入该容器的命名空间)
sudo docker inspect --format "{{ .State.Pid }}" 44da6a70ba46
sudo nsenter --target 258026 --mount --uts --ipc --net --pid