Mahout | 贝叶斯算法
2019-07-02 本文已影响0人
icebreakeros
贝叶斯
基本思想
- 已知类条件概率密度参数表达式和先验概率
- 利用贝叶斯公式转换成后验概率
- 根据后验概率大小进行决策分类
实例
/usr/local/mahout/examples/bin/classify-20newsgroups.sh
# Resolve the scratch directory used by every later step: honour
# MAHOUT_WORK_DIR when it is set and non-empty, otherwise fall back to a
# per-user directory under /tmp. The ':-' form is also safe under 'set -u'.
WORK_DIR="${MAHOUT_WORK_DIR:-/tmp/mahout-work-${USER}}"

# Quoted so that a work directory containing spaces or glob characters is
# created as a single path.
mkdir -p -- "${WORK_DIR}"
# Download the 20 Newsgroups "bydate" archive into the work directory.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as if it were the tarball; --location follows redirects.
curl --fail --location \
  http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz \
  -o "${WORK_DIR}/20news-bydate.tar.gz"

# Unpack with 'tar -C' rather than a cd/cd ../cd .. chain, so the script's
# working directory is never changed (the old chain left the shell in the
# parent of WORK_DIR, which breaks a relative multi-component WORK_DIR).
mkdir -p "${WORK_DIR}/20news-bydate"
tar -xzf "${WORK_DIR}/20news-bydate.tar.gz" -C "${WORK_DIR}/20news-bydate"

# Merge every second-level entry of the extracted tree into one directory
# (presumably the per-newsgroup folders of the train and test splits).
# 'mkdir -p' keeps a re-run from failing when the directory already exists.
mkdir -p "${WORK_DIR}/20news-all"
cp -R "${WORK_DIR}"/20news-bydate/*/* "${WORK_DIR}/20news-all"
# Stage the merged corpus in HDFS under the same path as the local work
# directory. Paths are quoted so a WORK_DIR containing whitespace or glob
# characters is passed to hdfs as a single argument.
hdfs dfs -mkdir -p "${WORK_DIR}"
hdfs dfs -mkdir "${WORK_DIR}/20news-all"
hdfs dfs -put "${WORK_DIR}/20news-all" "${WORK_DIR}/"
# Creating sequence files from 20newsgroups data.
# Reads the merged corpus directory and writes 20news-seq; paths are quoted
# so they survive whitespace in WORK_DIR.
mahout seqdirectory \
  -i "${WORK_DIR}/20news-all" \
  -o "${WORK_DIR}/20news-seq" -ow
# Converting sequence files to vectors.
# '-wt tfidf' selects TF-IDF weighting; the later split step reads the
# tfidf-vectors subdirectory of this output. Paths are quoted so they
# survive whitespace in WORK_DIR.
mahout seq2sparse \
  -i "${WORK_DIR}/20news-seq" \
  -o "${WORK_DIR}/20news-vectors" -lnorm -nv -wt tfidf
# Creating training and holdout set with a random 60-40 split of the
# generated vector dataset: --randomSelectionPct 40 selects 40% of the
# vectors for the test (holdout) output and leaves the rest for training.
# NOTE(review): this comment used to say "80-20"; if an 80-20 split is what
# is actually wanted, change the percentage to 20 instead.
mahout split \
  -i "${WORK_DIR}/20news-vectors/tfidf-vectors" \
  --trainingOutput "${WORK_DIR}/20news-train-vectors" \
  --testOutput "${WORK_DIR}/20news-test-vectors" \
  --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
# Training Naive Bayes model.
# Reads the training vectors, writes the model to ${WORK_DIR}/model and the
# label index to ${WORK_DIR}/labelindex; both are consumed by the testnb
# steps. Paths are quoted so they survive whitespace in WORK_DIR.
mahout trainnb \
  -i "${WORK_DIR}/20news-train-vectors" \
  -o "${WORK_DIR}/model" \
  -li "${WORK_DIR}/labelindex" \
  -ow
# Self testing on training set.
# Each continuation backslash is preceded by a space and each continued line
# is indented: a backslash glued to the path with the next line starting at
# column 0 would fuse the input path and '-m' into the single word
# ".../20news-train-vectors-m", leaving testnb without a model option.
mahout testnb \
  -i "${WORK_DIR}/20news-train-vectors" \
  -m "${WORK_DIR}/model" \
  -l "${WORK_DIR}/labelindex" \
  -ow -o "${WORK_DIR}/20news-testing"
# Testing on holdout set.
# Each continuation backslash is preceded by a space and each continued line
# is indented: a backslash glued to the path with the next line starting at
# column 0 would fuse the input path and '-m' into the single word
# ".../20news-test-vectors-m", leaving testnb without a model option.
# Note: the output goes to the same 20news-testing directory that the
# self-test step writes to, and -ow is passed here as well.
mahout testnb \
  -i "${WORK_DIR}/20news-test-vectors" \
  -m "${WORK_DIR}/model" \
  -l "${WORK_DIR}/labelindex" \
  -ow -o "${WORK_DIR}/20news-testing"