node:提取网页中的地址地理编码并转json

2021-09-06  本文已影响0人  春暖花已开
index.js
const http = require('http')
const path = require('path')
const fs = require('fs')
const cheerio = require('cheerio')

const { getLocalDatasForLnglat, createDirSync } = require('./getLnglat')

/**
 * Download a page, store it as HTML, extract the trimmed text of every
 * matching element into a JSON array, then pass that JSON file on to
 * getLocalDatasForLnglat.
 *
 * @param {object} options
 * @param {string} options.url - Address of the page to request.
 * @param {string} options.docParseDst - Output path without extension, resolved against __dirname
 *   (the page is stored as `${docParseDst}.html`, the extracted texts as `${docParseDst}.json`).
 * @param {string} options.selectorPath - Selector of the elements to search inside.
 * @param {string} options.findSelector - Selector looked up below `selectorPath`.
 * @param {string} options.lnglatDst - Destination path forwarded to getLocalDatasForLnglat.
 */
const getDataAndSave = ({ url, docParseDst, selectorPath, findSelector, lnglatDst }) => {
  const absoluteDocParseDst = path.resolve(__dirname, docParseDst)

  if (!createDirSync(path.dirname(absoluteDocParseDst))) {
    return
  }

  const aHtml = `${absoluteDocParseDst}.html`
  const aJson = `${absoluteDocParseDst}.json`

  // Read the stored page back, extract the texts and persist them as JSON
  const parseSavedHtml = () => {
    console.log('开始解析')

    fs.readFile(aHtml, { encoding: 'utf-8' }, (readError, datas) => {
      if (readError) {
        console.log(`fs-err`, readError)
        return
      }

      const $ = cheerio.load(datas)
      const elements = $(selectorPath).find(findSelector)
      if (elements.length === 0) {
        return
      }

      const result = []
      elements.each((i, item) => {
        result[i] = $(item).text().trim()
      })

      fs.writeFile(aJson, JSON.stringify(result), err => {
        if (err) {
          console.log(`fs-err`, err)
        } else {
          getLocalDatasForLnglat(aJson, lnglatDst)
        }
      })
    })
  }

  // Write stream for the downloaded page
  const writerStream = fs.createWriteStream(
    aHtml,
    /* both options are the defaults and could be omitted */
    { flags: 'w', autoClose: true }
  )
  writerStream.on('error', e => {
    console.log(`fs-err`, e)
  })
  // 'finish' fires only after every chunk has been flushed, so the file is
  // complete before it is read back; it never fires when the stream is destroyed
  writerStream.on('finish', parseSavedHtml)

  // http.get ends the request itself, no explicit req.end() is needed
  http
    .get(url, res => {
      res.setEncoding('utf-8')
      res.on('data', chunk => {
        // Write the page chunk by chunk
        writerStream.write(chunk)
        console.log('请求')
      })
      res.on('end', () => {
        writerStream.end()
      })
      res.on('error', e => {
        console.log('request-err', e.message)
        writerStream.destroy()
      })
    })
    .on('error', e => {
      console.log('request-err', e.message)
      // Do not parse a missing or partial page after a failed request
      writerStream.destroy()
    })
}

// Options for the single crawl performed when this script runs
const crawlOptions = {
  lnglatDst: './parser/smt.lg.map.nav.json',
  findSelector: 'a',
  selectorPath: '.sk-box .sk-item',
  docParseDst: './datas/社康',
  url: 'http://www.lg.gov.cn/zwfw/zdfw/yl/fwsm/yljg/sqjkfwzx/'
}

getDataAndSave(crawlOptions)
getLnglat.js
const http = require('https')
const path = require('path')
const fs = require('fs')

// Module-level list of geocoding results; getLnglatFromAmap appends to it and
// serializes the whole list when writing the output file
const results = []

/**
 * Read a local JSON file holding an array of addresses and start one
 * geocoding request per entry.
 *
 * @param {string} src - Path of the JSON file to read.
 * @param {string} dst - Destination path forwarded to getLnglatFromAmap.
 */
const getLocalDatasForLnglat = (src, dst) => {
  if (!fs.existsSync(src)) {
    console.error('未找到对应路径的文件')
    return
  }

  fs.readFile(src, { encoding: 'utf-8' }, (err, datas) => {
    if (err) {
      console.error('读取本地文件出错: ', err)
      return
    }

    // A throw inside this callback would be uncaught and end the process,
    // so malformed content is reported instead
    let parseDatas
    try {
      parseDatas = JSON.parse(datas)
    } catch (parseError) {
      console.error('读取本地文件出错: ', parseError)
      return
    }

    if (!Array.isArray(parseDatas)) {
      console.error('读取本地文件出错: ', new TypeError('expected a JSON array of addresses'))
      return
    }

    for (const addr of parseDatas) {
      getLnglatFromAmap(addr, dst)
    }
  })
}

// One debounced writer per destination path, so each dst is flushed to its own file
const pendingWriters = new Map()

// Schedule a (debounced) write of all collected results to dst
const scheduleResultsWrite = dst => {
  if (!pendingWriters.has(dst)) {
    pendingWriters.set(
      dst,
      debounce(() => writeToFile(dst, JSON.stringify(results)), 1000)
    )
  }
  pendingWriters.get(dst)()
}

/**
 * Look up the longitude/latitude of an address through the Amap geocoding
 * endpoint, append the first match to `results` and schedule a write to dst.
 *
 * @param {string} addr - Address to geocode.
 * @param {string} dst - Path of the file the collected results are written to.
 */
const getLnglatFromAmap = (addr, dst) => {
  // NOTE(review): the API key is hard-coded in source; consider loading it from configuration
  // The address is escaped so characters such as `&` or `#` cannot corrupt the query string
  const url = `https://restapi.amap.com/v3/geocode/geo?key=ead4b4ffc3093ac65bf76055625e47a6&s=rsv3&city=0755&address=${encodeURIComponent(addr)}`

  // http.get ends the request itself, no explicit req.end() is needed
  http
    .get(url, res => {
      // A response may arrive in several chunks; parse only once it is complete
      const chunks = []
      res.on('data', chunk => {
        chunks.push(chunk)
      })
      res.on('end', () => {
        let payload
        try {
          payload = JSON.parse(Buffer.concat(chunks).toString())
        } catch (parseError) {
          console.log('request-err', parseError.message)
          return
        }

        const geocodes = payload && Array.isArray(payload.geocodes) ? payload.geocodes : []
        const [firstMatch] = geocodes
        if (
          !firstMatch ||
          typeof firstMatch.location !== 'string' ||
          typeof firstMatch.formatted_address !== 'string'
        ) {
          console.log('request-err', `no geocode result for address: ${addr}`)
          return
        }

        const { formatted_address, location } = firstMatch
        const [lng, lat] = location.split(',')
        results.push({
          name: formatted_address.replace(/^(广东省?)?(深圳市?)?/, ''),
          lat,
          lng
        })

        scheduleResultsWrite(dst)
      })
    })
    .on('error', e => {
      console.log('request-err', e.message)
    })
}

/**
 * Make sure `pathName` exists, creating it and any missing parent folders.
 *
 * @param {string} pathName - Directory path to create.
 * @returns {boolean} Always `true`; file-system failures are thrown by fs.mkdirSync.
 */
const createDirSync = pathName => {
  // An already existing path (whatever its type) is reported as ready
  if (!fs.existsSync(pathName)) {
    // `recursive` creates missing parents and does not fail when the folder
    // appears between the check above and this call
    fs.mkdirSync(pathName, { recursive: true })
  }
  return true
}

// Write `content` to `dstPath`, creating the parent folders beforehand
const writeToFile = (dstPath, content) => {
  const parentReady = createDirSync(path.dirname(dstPath))
  if (!parentReady) {
    return
  }

  // Log the outcome of the asynchronous write
  const reportOutcome = err => {
    if (!err) {
      console.log('地理编码成功')
      return
    }
    console.log(`写入本地报错: `, err)
  }

  fs.writeFile(dstPath, content, reportOutcome)
}

// Debounce: the returned function postpones `func` until `delay` ms have
// passed without another call; only the last call's `this` and arguments are used
const debounce = (func, delay) => {
  let pendingId = null

  return function (...callArgs) {
    const context = this
    // clearTimeout ignores null, so no guard is needed before the first call
    clearTimeout(pendingId)
    pendingId = setTimeout(function () {
      func.apply(context, callArgs)
    }, delay)
  }
}

// Functions exposed to other modules; everything else in this file stays private
module.exports = {
  createDirSync,
  getLocalDatasForLnglat
}
上一篇 下一篇

猜你喜欢

热点阅读