20190709工作进展
- 60个epoch测试
-- Submit a 10-worker PyTorch job on PAI that runs clusterUsingPrecenter.py from the packaged archive.
-- Inputs: the video pool table and the top-query table; outputs: the title5 table plus title5_0 .. title5_4.
pai -name pytorch
    -project algo_public_dev
    -Dpython=3.6
    -Dscript="file:///apsarapangu/disk1/hengsong.lhs/origin_deep_cluster_odps_5.tar.gz"
    -DentryFile="clusterUsingPrecenter.py"
    -Dtables="odps://graph_embedding/tables/hs_jingyan_query_related_video_pool_2_3,odps://graph_embedding/tables/hs_jingyan_query_related_top_query_1"
    -Doutputs="odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title5,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title5_0,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title5_1,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title5_2,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title5_3,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title5_4"
    -Dbucket="oss://bucket-automl/"
    -Darn="acs:ram::1293303983251548:role/graph2018"
    -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com"
    -DuserDefinedParameters=""
    -DworkerCount=10;
- 60 epoch with same center
-- Submit a 10-worker PyTorch job on PAI that runs clusterUsingSameCenter.py from the packaged archive.
-- Inputs: the video pool table and the top-query table; outputs: the title6 table plus title6_0 .. title6_4.
pai -name pytorch
    -project algo_public_dev
    -Dpython=3.6
    -Dscript="file:///apsarapangu/disk1/hengsong.lhs/origin_deep_cluster_odps_5.tar.gz"
    -DentryFile="clusterUsingSameCenter.py"
    -Dtables="odps://graph_embedding/tables/hs_jingyan_query_related_video_pool_2_3,odps://graph_embedding/tables/hs_jingyan_query_related_top_query_1"
    -Doutputs="odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title6,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title6_0,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title6_1,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title6_2,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title6_3,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title6_4"
    -Dbucket="oss://bucket-automl/"
    -Darn="acs:ram::1293303983251548:role/graph2018"
    -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com"
    -DuserDefinedParameters=""
    -DworkerCount=10;
- 60 epoch without same center 1000 class
-- Submit a 10-worker PyTorch job on PAI that runs clusterUsingPrecenter.py from the packaged archive.
-- Inputs: the video pool table and the top-query table; outputs: the title7 table plus title7_0 .. title7_4.
-- NOTE(review): apart from the output tables, every argument matches the title5 run and
-- -DuserDefinedParameters is empty; the "1000 class" setting is presumably inside the script archive. Confirm.
pai -name pytorch
    -project algo_public_dev
    -Dpython=3.6
    -Dscript="file:///apsarapangu/disk1/hengsong.lhs/origin_deep_cluster_odps_5.tar.gz"
    -DentryFile="clusterUsingPrecenter.py"
    -Dtables="odps://graph_embedding/tables/hs_jingyan_query_related_video_pool_2_3,odps://graph_embedding/tables/hs_jingyan_query_related_top_query_1"
    -Doutputs="odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title7,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title7_0,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title7_1,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title7_2,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title7_3,odps://graph_embedding/tables/hs_jingyan_query_cluster_result_title7_4"
    -Dbucket="oss://bucket-automl/"
    -Darn="acs:ram::1293303983251548:role/graph2018"
    -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com"
    -DuserDefinedParameters=""
    -DworkerCount=10;
- 60 epoch without same center --result
泳衣类
儿童婴儿
女鞋-潮鞋
拖鞋女
- title问题的终结:
-- Submit a single-worker PyTorch job on PAI that runs test_query_with_title.py from the packaged archive.
-- Inputs: the video pool table and hs_jingyan_query_related_top_query_3; output: hs_query_title_6.
pai -name pytorch
    -project algo_public_dev
    -Dpython=3.6
    -Dscript="file:///apsarapangu/disk1/hengsong.lhs/origin_deep_cluster_odps_5.tar.gz"
    -DentryFile="test_query_with_title.py"
    -Dtables="odps://graph_embedding/tables/hs_jingyan_query_related_video_pool_2_3,odps://graph_embedding/tables/hs_jingyan_query_related_top_query_3"
    -Doutputs="odps://graph_embedding/tables/hs_query_title_6"
    -Dbucket="oss://bucket-automl/"
    -Darn="acs:ram::1293303983251548:role/graph2018"
    -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com"
    -DuserDefinedParameters=""
    -DworkerCount=1;
-- Submit a single-worker PyTorch job on PAI that runs test_query_with_title.py from the packaged archive.
-- Inputs: hs_tmp_video_emb_0 and hs_jingyan_query_related_top_query_128; output: hs_query_title_4.
pai -name pytorch
    -project algo_public_dev
    -Dpython=3.6
    -Dscript="file:///apsarapangu/disk1/hengsong.lhs/origin_deep_cluster_odps_5.tar.gz"
    -DentryFile="test_query_with_title.py"
    -Dtables="odps://graph_embedding/tables/hs_tmp_video_emb_0,odps://graph_embedding/tables/hs_jingyan_query_related_top_query_128"
    -Doutputs="odps://graph_embedding/tables/hs_query_title_4"
    -Dbucket="oss://bucket-automl/"
    -Darn="acs:ram::1293303983251548:role/graph2018"
    -Dhost="cn-hangzhou.oss-internal.aliyun-inc.com"
    -DuserDefinedParameters=""
    -DworkerCount=1;
使用query和title的词向量计算欧氏距离,取距离最小的前K个,就可以得到非常好的结果,比自动编码器的效果要好很多。
hs_query_title_1:title和query的对应效果
hs_query_title_2:title之间的对应效果
hs_query_title_3:video_emb之间的对应效果
video_emb和query的对应效果暂时没有办法得到,因为使用alinlp得到的词向量只有50/100/200三种维度,而video_emb是128维的。
-- Create the table (only if it does not already exist) with a BIGINT node_id and a STRING emb column.
-- LIFECYCLE 14 sets the table's retention to 14 days.
CREATE TABLE IF NOT EXISTS graph_embedding.hs_heter_graph_embedding_out_nearest_neighbor_006
(
    node_id BIGINT,
    emb     STRING
)
LIFECYCLE 14;
hs_heter_graph_embedding_out_nearest_neighbor_006
-- Run the am_vsearch_nearest_neighbor_014 job from project algo_market with the settings below
-- (dim=100, topk=100, nprob=512, l2 metric, 40 workers), writing to
-- hs_heter_graph_embedding_out_nearest_neighbor_006.
-- The JSON given to -Dcluster has its inner double quotes escaped; left unescaped they end the
-- surrounding double-quoted argument early, so the value is not passed as one intact JSON string.
-- NOTE(review): the -Dinput and -Dquery table names end with "_"; a suffix looks to be missing.
-- Confirm the real table names before running.
PAI -name am_vsearch_nearest_neighbor_014 -project algo_market
-Dcluster="{\"worker\":{\"count\":40,\"gpu\":100}}"
-Ddim=100
-Did_col="node_id"
-Dvector_col="emb"
-Dinput_slice=40
-Dtopk=100
-Dnprob=512
-Dmetric="l2"
-Dinput="odps://graph_embedding/tables/hs_heter_graph_embedding_video_recall_"
-Dquery="odps://graph_embedding/tables/hs_heter_graph_embedding_ave_info_"
-Doutputs="odps://graph_embedding/tables/hs_heter_graph_embedding_out_nearest_neighbor_006"
-DenableDynamicCluster=true -DmaxTrainingTimeInHour=60;
- 10k query结果
结果在hs_query_title_6中
- 查询分区表
-- Preview 10 rows of tbcdm.dim_tb_itm, filtering ds to the partition value returned by max_pt for that table.
SELECT *
FROM tbcdm.dim_tb_itm
WHERE ds = max_pt('tbcdm.dim_tb_itm')
LIMIT 10;