一段采集视频地址程序笔记!

2021-09-11  本文已影响0人  DragonersLi

尚未建立唯一索引时,先统计数据库中不重复的数据总数

# Fold every row of $data into one multi-row INSERT statement.
# Inputs : $data  - iterable of objects exposing ->title, ->url, ->type
#          $table - target table name (trusted, not user supplied)
# Outputs: $values_string - the comma-joined VALUES tuples
#          $sql           - the INSERT statement, or '' when $data is empty
$values = [];   # start empty so an empty $data never reaches implode() undefined
foreach ($data as $v) {
    # Escape quotes and backslashes so a value containing ' cannot terminate its literal.
    # NOTE(review): addslashes() is connection-agnostic; prefer a prepared statement
    # or the driver's own escaping wherever a database handle is available.
    $title = addslashes($v->title);
    $url   = addslashes($v->url);
    $type  = addslashes($v->type);
    $values[] = " ('{$title}','{$url}','{$type}') ";
}
$values_string = implode(',', $values);
# An INSERT with an empty VALUES list is a syntax error, so emit no statement at all.
$sql = ($values === [])
    ? ''
    : " INSERT INTO `{$table}` (`title`,`url`,`type`) VALUES {$values_string} ";
 

 
-- Total number of distinct URLs currently stored.
-- The aggregate expression text is kept verbatim so the result column label is unchanged.
SELECT
    count(DISTINCT url)
FROM `table`

建立唯一索引(UNIQUE KEY),避免插入重复数据


-- Storage for scraped video links; the UNIQUE key on `url` rejects duplicate links.
-- NOTE(review): `title`, `url` and `type` name only a CHARACTER SET, so they take that
-- charset's default collation rather than the table's utf8mb4_unicode_ci - confirm intended.
CREATE TABLE `table` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(50) CHARACTER SET utf8mb4 NOT NULL DEFAULT '' COMMENT '视频标题',
  `intro` varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '视频简介',
  `url` varchar(200) CHARACTER SET utf8mb4 NOT NULL DEFAULT '' COMMENT '链接地址',
  `type` varchar(20) CHARACTER SET utf8mb4 NOT NULL DEFAULT '' COMMENT '播放类型',
  `pid` tinyint(1) unsigned NOT NULL DEFAULT '0' COMMENT '视频分类【0:flash;1:fuli;2:ll】',
  `status` tinyint(1) unsigned NOT NULL DEFAULT '1' COMMENT '状态【0:禁用;1:正常】',
  `sort` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '排序(值越大越靠前)',
  -- Creation time: set once on INSERT. No ON UPDATE clause, otherwise every later
  -- UPDATE of the row would overwrite the creation timestamp.
  `created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  PRIMARY KEY (`id`),
  UNIQUE KEY `unique_url` (`url`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;



<?php

/*
 * Video-link scraper.
 *
 * Walks the listing pages of one category, follows every entry to its detail
 * page, extracts the play links found there and appends one INSERT statement
 * per link to "./{$table}.txt". Relies on getContent() defined in this file.
 */

header("Content-type: text/html; charset=gb2312");
set_time_limit(0);                          // lift PHP's script execution time limit
ini_set('date.timezone', 'Asia/Shanghai');
ini_set('memory_limit','256M');             // raise the memory limit
ignore_user_abort(true);                    // keep running even if the client disconnects


$time_start = microtime(true);              // start of the run
$baseUrl = 'http://n.haha.com';             // site root
$table = 'table_fuli';                      // table name, also the output file name
$arr = [0=>'flash',1=>'fuli',2=>'ll'];      // category id => URL path segment
$select = 1;                                // category to scrape
$caturl = $arr[$select];                    // path segment of that category
$start = 0;                                 // first page index (inclusive)
$end = 800;                                 // last page index (exclusive: loop runs while $i < $end)
$outFile = "./{$table}.txt";                // generated SQL is appended here


for ($i = $start; $i < $end; $i++) {
    $ext = ($i > 0) ? $i : '';              // page 0 is index.html, page N is indexN.html
    $pageUrl = "{$baseUrl}/{$caturl}/index{$ext}.html";
    $con = getContent($pageUrl);

    // A failed fetch yields false; treat it explicitly as "no entries on this page".
    $result = [];
    if ($con !== false) {
        #preg_match_all('#<a href="(.*)" class="playlink" target="_blank">#iU',$con,$result);
        // $result[1] = detail page paths, $result[2] = video titles
        preg_match_all('#<h2><a href="(.*)" target="_blank">(.*)</a></h2>#iU',$con,$result);
    }
    unset($con);

    if (!empty($result[1])) {
        foreach ($result[1] as $k => $v) {
            $playlink = $baseUrl.$v;        // detail page URL
            $res = getContent($playlink);

            $data = [];
            if ($res !== false) {
                // $data[3] = markup holding the play links, $data[5] = play type label
                preg_match_all('# <div class="playlist">([\w\W]+)(.*)([\w\W]+)</div>([\w\W]+)<!--(.*) End-->#iU',$res,$data);
            }
            unset($res);

            if (empty($data[3])) {
                continue;
            }

            foreach ($data[3] as $k1 => $playlinks) {
                // $links[1] = link titles (used as intro), $links[2] = link paths
                preg_match_all('#<a title=\'(.*)\' href=\'(.*)\' target="_blank">#iU',$playlinks,$links);

                if (empty($links[2])) {
                    continue;
                }

                foreach ($links[2] as $k2 => $playurl) {
                    // Scraped text is untrusted: escape quotes and backslashes so a
                    // value containing ' cannot break the generated statement.
                    // NOTE(review): addslashes() is used because no database
                    // connection exists here; confirm the import target does not
                    // run with NO_BACKSLASH_ESCAPES.
                    $title = addslashes($result[2][$k]);        // video title
                    $intro = addslashes($links[1][$k2]);        // video intro
                    $url   = addslashes($baseUrl.$playurl);     // video URL
                    $type  = addslashes($data[5][$k1]);         // play type
                    $pid   = (int)$select;

                    // One statement per line, terminated with CRLF. A bare "\r" is
                    // not treated as a line break by most tools.
                    $sql = "INSERT INTO `{$table}` (title,intro,url,type,pid) VALUES ('{$title}','{$intro}','{$url}','{$type}',{$pid}); \r\n";
                    file_put_contents($outFile, $sql, FILE_APPEND);
                }
            }
        }
    }

    sleep(1);   // pause one second between listing pages
}

$time_end = microtime(true);    // end of the run
// Leading '#' makes this line a comment when the file is fed to MySQL.
$time = sprintf("#Success! time used: %ss",round($time_end-$time_start,6));
echo $time;
file_put_contents($outFile, $time, FILE_APPEND);
 

 
/**
 * Fetch the body of a URL with a GET request and a 10 second timeout.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when file_get_contents() fails.
 */
function getContent($url){

    // All header lines must live in the single 'header' string. The original
    // separated them with a comma, which made the Cookie line a stray array
    // element at key 0, so it was never sent.
    $headers = "Accept-language: en\r\n"
             . "Cookie: foo=bar\r\n";

    $opts = [
        'http' => [
            'method'  => "GET",
            'timeout' => 10,
            'header'  => $headers,
        ],
    ];
    $context = stream_context_create($opts);
    return file_get_contents($url, false, $context);

}




上一篇 下一篇

猜你喜欢

热点阅读