Chrome阅读模式

2018-08-17  本文已影响0人  蒂卡波牧羊犬

提取特征:
extract_features.js 会根据给定的 URL 列表,为每个页面提取 HTML 特征以及 Chrome distilled(蒸馏后)特征,输出文件分别为 name.feature 和 name.dfeature。
features:

 // Raw per-page features read from the live document. This is the interior of
 // an object literal; the enclosing braces are outside this excerpt.
 'opengraph': hasOGArticle(),
 'url': document.location.href,
 'title': document.title,
 // Element counts under `body`, one per CSS selector.
 'numElements': body.querySelectorAll('*').length,
 'numAnchors': body.querySelectorAll('a').length,
 'numForms': body.querySelectorAll('form').length,
 'numTextInput': body.querySelectorAll('input[type="text"]').length,
 'numPasswordInput': body.querySelectorAll('input[type="password"]').length,
 'numPPRE': body.querySelectorAll('p,pre').length,
 // Three textual views of the body, kept whole.
 'innerText': body.innerText,
 'textContent': body.textContent,
 'innerHTML': body.innerHTML,
 // _mozScore(trim, power, cut, excludeLi, saturate) variants. Each result is
 // capped at 6x the largest contribution a single paragraph can make with the
 // same power/cut/saturate, i.e. pow(saturate - cut, power).
 'mozScore': Math.min(6 * Math.sqrt(1000 - 140), _mozScore(false, 0.5, 140, true, 1000)),
 'mozScoreAllSqrt': Math.min(6 * Math.sqrt(1000), _mozScore(false, 0.5, 0, true, 1000)),
 'mozScoreAllLinear': Math.min(6 * 1000, _mozScore(false, 1, 0, true, 1000)),
 // countVisible() is defined outside this excerpt; presumably it counts the
 // nodes for which isVisible() holds -- TODO confirm.
 'visibleElements': countVisible(body.querySelectorAll('*')),
 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')),

native:

derived features:

  # Flat list of alternating feature names and values:
  # [name, value, name, value, ...]. Every input used below (index, opengraph,
  # path, the num*/visible* counts, the text blobs and word counts, raw) is
  # defined outside this excerpt.
  features = [
    'id', index,
    # NOTE(review): why sin(index) is emitted is not evident from this excerpt.
    'sin', math.sin(index),
    'openGraph', opengraph,

    # Keyword flags on the URL path (assumes `path` is a str, so `in` is a
    # substring test -- TODO confirm).
    'forum', 'forum' in path,
    'index', 'index' in path,
    'search', 'search' in path,
    'view', 'view' in path,
    'archive', 'archive' in path,
    'asp', '.asp' in path,
    'phpbb', 'phpbb' in path,
    'php', path.endswith('.php'),
    'pathLength', len(path),
    # True when the path has fewer than 2 characters (presumably a bare
    # domain root such as '/').
    'domain', len(path) < 2,
    # Regex match counts over the path; CountMatches and GetLastSegment are
    # helpers defined elsewhere.
    'pathComponents', CountMatches(path, r'\/.'),
    'slugDetector', CountMatches(path, r'[^\w/]'),
    'pathNumbers', CountMatches(path, r'\d+'),
    'lastSegmentLength', len(GetLastSegment(path)),

    # Element-count ratios; max(1, ...) keeps the denominator non-zero.
    'visibleRatio', float(visibleElements) / max(1, numElements),
    'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
    'PPRERatio', float(numPPRE) / max(1, numElements),
    'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),

    # Character lengths of the three text views and their pairwise ratios.
    'innerTextLength', len(innerText),
    'textContentLength', len(textContent),
    'innerHtmlLength', len(innerHTML),
    'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
    'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)),
    'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textContent)),

    # Same comparisons using the precomputed word counts.
    'innerTextWordCount', innerTextWords,
    'textContentWordCount', textContentWords,
    'innerhtmlWordCount', innerHTMLWords,
    'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
    'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords),
    'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textContentWords),

    # Plain counts re-exported under shorter names.
    'textCount', numText,
    'passwordCount', numPassword,
    'formCount', numForms,
    'anchorCount', numAnchors,
    'elementCount', numElements,
    'anchorRatio', float(numAnchors) / max(1, numElements),
  ]

  # Append every raw entry whose key contains 'mozScore' or 'num', visiting
  # keys in sorted order so the resulting layout is deterministic.
  for k in sorted(raw):
    if 'mozScore' in k or 'num' in k:
      features += [k, raw[k]]

mozScore

  /**
   * Accumulates a text-length score over the document's <p> and <pre> nodes.
   *
   * A node is ignored when it is not visible, when its "className id" string
   * matches `unlikelyCandidates` without also matching `okMaybeItsACandidate`,
   * or (with `excludeLi`) when it matches the selector "li p".
   *
   * @param {boolean} trim - Trim the node text before measuring its length.
   * @param {number} power - Exponent applied to (length - cut).
   * @param {number} cut - Nodes whose capped length is below this add nothing.
   * @param {boolean} excludeLi - Skip paragraphs nested inside list items.
   * @param {number} saturate - Upper bound on the length counted per node.
   * @returns {number} Sum of pow(min(saturate, length) - cut, power) over the
   *     nodes that were kept.
   */
  function _mozScore(trim, power, cut, excludeLi, saturate) {
    // Name-based filter: rejected only if "unlikely" and not rescued.
    const rejectedByName = function (label) {
      return unlikelyCandidates.test(label) &&
          !okMaybeItsACandidate.test(label);
    };

    const candidates = document.querySelectorAll('p,pre');
    let total = 0;

    for (let idx = 0; idx < candidates.length; idx += 1) {
      const el = candidates[idx];

      if (!isVisible(el)) {
        continue;
      }
      if (rejectedByName(el.className + " " + el.id)) {
        continue;
      }
      if (excludeLi && el.matches && el.matches("li p")) {
        continue;
      }

      const rawText = el.textContent;
      const measured = trim ? rawText.trim() : rawText;
      const cappedLength = Math.min(saturate, measured.length);

      // Written as !(a < b) rather than a >= b so NaN operands behave as before.
      if (!(cappedLength < cut)) {
        total += Math.pow(cappedLength - cut, power);
      }
    }

    return total;
  }

分类算法

OG_ARTICLE
判断页面的 meta 标签中是否包含 og:type。

AdaBoost
原理:https://blog.csdn.net/v_JULY_v/article/details/40718799

AdaBoost
上一篇下一篇

猜你喜欢

热点阅读