node 爬虫

2018-09-03  本文已影响20人  秋天de童话

clawer.js

const fs=require('fs');
const Mysql=require('mysql-pro');

// Connection settings for the MySQL server that receives the imported rows.
const mysqlSettings={
  host: 'localhost',
  port: 3306,
  user: 'root',
  password: 'admin',
  database: 'zhihu'
};

// mysql-pro client shared by the import step at the bottom of the script.
const db=new Mysql({mysql: mysqlSettings});


// Parsed contents of the '.topics' file; the code below iterates it as an
// array of question records.
const arr=JSON.parse(fs.readFileSync('.topics', 'utf8'));

// Lookup tables filled while normalizing `arr`, each paired with the next
// sequential ID to hand out (IDs start at 1).
const topics={};
let topic_ID=1;

const authors={};
let author_ID=1;

const questions={};
let question_ID=1;

const answers={};
let answer_ID=1;

// Own-property test for the plain-object lookup tables, so that inherited
// keys such as "constructor" are never mistaken for registered entries.
const hasOwnKey=(table, key)=>Object.prototype.hasOwnProperty.call(table, key);

/**
 * Registers an author on first sight and returns its sequential ID.
 *
 * The first object seen for a given source id is stored in `authors` (keyed
 * by that source id) and its `id` is overwritten with the next `author_ID`.
 * Later objects carrying the same source id resolve to the ID of the stored
 * object, so every answer by one author references the same row.
 *
 * @param {{id: *}} author - author record as found in the input data
 * @returns {number} sequential author ID
 */
function registerAuthor(author){
  const sourceId=author.id;

  if(!hasOwnKey(authors, sourceId)){
    authors[sourceId]=author;
    author.id=author_ID++;
  }

  return authors[sourceId].id;
}

// Normalizes every question in place: replaces nested topics/authors with
// sequential IDs and fills the `topics`, `authors`, `questions` and `answers`
// lookup tables.
arr.forEach(question=>{
  // Topics: map each trimmed title to a sequential ID and store the IDs as a
  // comma-separated string. The input key is read as `topices` (presumably
  // the spelling used by the data file — verify against the scraper output).
  question.topics=question.topices.map(({title})=>{
    const name=title.trim();

    if(!hasOwnKey(topics, name)){
      topics[name]=topic_ID++;
    }

    return topics[name];
  }).join(',');

  // Authors: replace each embedded author object with its sequential ID,
  // best answer first, then the remaining answers in order.
  [question.bestAnswer, ...question.answers].forEach(answer=>{
    answer.author_ID=registerAuthor(answer.author);
    delete answer.author;
  });

  // Question: remember the ID before incrementing so the answers can use it.
  const ID=question_ID;
  questions[question_ID++]=question;

  // Answers: the best answer and the regular answers share one ID sequence.
  [question.bestAnswer, ...question.answers].forEach(answer=>{
    answer.id=answer_ID;
    answer.question_ID=ID;
    answers[answer_ID++]=answer;
  });
});

// Builds one multi-row INSERT per table from the lookup tables and runs them
// inside a single transaction.
(async()=>{
  // Characters that need a backslash escape inside a single-quoted MySQL
  // string literal (assumes NO_BACKSLASH_ESCAPES is not enabled — confirm).
  const SQL_ESCAPES={
    '\0': '\\0',
    '\b': '\\b',
    '\t': '\\t',
    '\n': '\\n',
    '\r': '\\r',
    '\x1a': '\\Z',
    '"': '\\"',
    '\'': '\\\'',
    '\\': '\\\\'
  };

  /**
   * Formats the arguments as one SQL value tuple: ('a','b',...).
   *
   * Only null/undefined become the empty string; 0 and false are kept as
   * '0' / 'false'. Backslashes and quotes are escaped so that a value cannot
   * terminate the literal early.
   *
   * @param {...*} args - column values in table order
   * @returns {string} parenthesized, quoted, comma-separated tuple
   */
  function dataJoin(...args){
    return "('"+args.map(item=>{
      const text=item==null ? '' : String(item);

      return text.replace(/[\0\b\t\n\r\x1a"'\\]/g, ch=>SQL_ESCAPES[ch]);
    }).join("','")+"')";
  }

  // Returns `value`, or -1 when it is missing (empty string, null or
  // undefined). A real 0 is preserved.
  const orUnknown=value=>(value==='' || value==null) ? -1 : value;

  // Returns the INSERT statement for `table`, or null when there are no rows
  // (an INSERT with an empty VALUES list is not valid SQL).
  const buildInsert=(table, rows)=>rows.length>0 ? `INSERT INTO ${table} VALUES${rows.join(',')}` : null;

  //topics
  const aTopics=Object.keys(topics).map(title=>dataJoin(topics[title], title));

  //authors
  const aAuthors=Object.keys(authors).map(id=>{
    const author=authors[id];

    return dataJoin(author.id, author.type, author.name, orUnknown(author.gender), author.userType, author.img_url, author.headline, orUnknown(author.followerCount));
  });

  //questions
  const aQuestions=Object.keys(questions).map(ID=>{
    const question=questions[ID];

    return dataJoin(ID, question.title, question.question_content, question.topics, question.attention_count, question.view_count, question.bestAnswer.id);
  });

  //answers
  const aAnswers=Object.keys(answers).map(ID=>{
    const answer=answers[ID];

    return dataJoin(ID, answer.question_ID, answer.author_ID, answer.content, answer.createdTime);
  });

  // Statements in insertion order; empty tables are skipped.
  const statements=[
    buildInsert('topic_table', aTopics),
    buildInsert('author_table', aAuthors),
    buildInsert('question_table', aQuestions),
    buildInsert('answer_table', aAnswers)
  ].filter(sql=>sql!==null);

  await db.startTransaction();
  for(const sql of statements){
    await db.executeTransaction(sql);
  }
  await db.stopTransaction();

  console.log('完成');
})().catch(err=>{
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode=1;
});
上一篇下一篇

猜你喜欢

热点阅读