一段采集视频地址程序笔记!
2021-09-11 本文已影响0人
DragonersLi
未建唯一索引时,用 DISTINCT 查询数据库中不重复的数据总数
# Combine every row of $data into one multi-row INSERT so a single statement is executed.
# Reads:  $data  (iterable of objects exposing ->title, ->url, ->type) and $table (table name).
# Writes: $values, $values_string and $sql ($sql is '' when $data yields no rows).
$values = []; // start from an empty list so an empty $data cannot leave $values undefined
foreach ($data as $row) {
    // Escape quotes and backslashes so a value cannot terminate its SQL string literal.
    // NOTE(review): addslashes() assumes MySQL's default backslash-escape mode; where a live
    // connection is available, a prepared statement with bound values is the safer choice.
    $title = addslashes($row->title);
    $url = addslashes($row->url);
    $type = addslashes($row->type);
    $values[] = " ('{$title}','{$url}','{$type}') ";
}
$values_string = implode(',', $values);
// With no rows there is nothing to insert: leave $sql empty instead of emitting
// "VALUES" followed by no tuples, which is a syntax error.
$sql = $values
    ? " INSERT INTO `{$table}` (`title`,`url`,`type`) VALUES {$values_string} "
    : '';
-- Count how many different url values the table holds (repeated urls are counted once).
select
    count(DISTINCT url)
from `table`;
建立唯一索引,避免插入重复数据
-- Video table. The UNIQUE KEY on `url` makes the database itself reject a second row
-- with the same link, so duplicates no longer have to be filtered out with DISTINCT.
CREATE TABLE `table` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(50) CHARACTER SET utf8mb4 NOT NULL DEFAULT '' COMMENT '视频标题',
  `intro` varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '视频简介',
  `url` varchar(200) CHARACTER SET utf8mb4 NOT NULL DEFAULT '' COMMENT '链接地址',
  `type` varchar(20) CHARACTER SET utf8mb4 NOT NULL DEFAULT '' COMMENT '播放类型',
  `pid` tinyint(1) unsigned NOT NULL DEFAULT '0' COMMENT '视频分类【0:flash;1:fuli;2:ll】',
  `status` tinyint(1) unsigned NOT NULL DEFAULT '1' COMMENT '状态【0:禁用;1:正常】',
  `sort` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '排序(值越大越靠前)',
  -- Creation time: filled in on INSERT only. There is deliberately no
  -- ON UPDATE CURRENT_TIMESTAMP clause, which would overwrite the value on every UPDATE.
  `created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  PRIMARY KEY (`id`),
  UNIQUE KEY `unique_url` (`url`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
<?php
/*
 * Scraper script: walks the paginated listing of one category, opens every detail page
 * linked from it, extracts the play links, and appends one INSERT statement per play link
 * to "./{$table}.txt". Nothing is written to a database here; the text file is the output.
 */
header("Content-type: text/html; charset=gb2312");
set_time_limit(0);                 // remove PHP's script execution time limit
ini_set('date.timezone', 'Asia/Shanghai');
ini_set('memory_limit', '256M');   // raise the memory limit
ignore_user_abort(true);           // keep running even if the client closes the browser

$time_start = microtime(true);     // start of the run, for the elapsed-time report

$baseUrl = 'http://n.haha.com';    // site root
$table = 'table_fuli';             // target table name, also the output file's base name
$arr = [0=>'flash',1=>'fuli',2=>'ll'];
$select = 1;                       // index of the category to scrape (also stored as pid)
$caturl = $arr[$select];           // URL path segment of the chosen category
$start = 0;                        // first page index (inclusive)
$end = 800;                        // last page index (exclusive)

for ($i = $start; $i < $end; $i++) {
    $ext = ($i > 0) ? $i : '';     // page 0 is "index.html", later pages are "index{N}.html"
    $url = "{$baseUrl}/{$caturl}/index{$ext}.html"; // listing page URL
    $con = getContent($url);

    // Alternative pattern kept from the original notes:
    #preg_match_all('#<a href="(.*)" class="playlink" target="_blank">#iU',$con,$result);
    // $result[1] = detail-page hrefs, $result[2] = link texts (used as the video title).
    preg_match_all('#<h2><a href="(.*)" target="_blank">(.*)</a></h2>#iU', $con, $result);
    unset($con);

    if (!empty($result[1])) {
        foreach ($result[1] as $k => $v) {
            $playlink = $baseUrl.$v;          // detail page URL
            $res = getContent($playlink);

            // Escaped once per detail page: every play link below shares this title.
            // addslashes() keeps a quote or backslash in scraped text from terminating the
            // SQL string literal. NOTE(review): assumes MySQL's default backslash-escape mode.
            $title = addslashes($result[2][$k]);

            // $data[3] = playlist markup, $data[5] = text before " End" in the closing comment
            // (used as the play type).
            preg_match_all('# <div class="playlist">([\w\W]+)(.*)([\w\W]+)</div>([\w\W]+)<!--(.*) End-->#iU', $res, $data);
            unset($res);

            if (!empty($data[3])) {
                foreach ($data[3] as $k1 => $playlinks) {
                    // $links[1] = title attribute (stored as intro), $links[2] = href.
                    preg_match_all('#<a title=\'(.*)\' href=\'(.*)\' target="_blank">#iU', $playlinks, $links);
                    unset($playlinks);

                    $type = addslashes($data[5][$k1]); // play type of this playlist

                    if (!empty($links[2])) {
                        foreach ($links[2] as $k2 => $playurl) {
                            $intro = addslashes($links[1][$k2]);    // video intro
                            $url = addslashes($baseUrl.$playurl);   // video address
                            // One statement per row.
                            // NOTE(review): the terminator is a bare "\r" (no "\n"); kept as in
                            // the original output format — confirm the importing tool accepts it.
                            $sql = "INSERT INTO `{$table}` (title,intro,url,type,pid) VALUES ('{$title}','{$intro}','{$url}','{$type}',{$select}); \r";
                            file_put_contents("./{$table}.txt", $sql, FILE_APPEND);
                        }
                    }
                }
            }
        }
    }
    sleep(1); // pause one second between listing pages
}

$time_end = microtime(true);
// Elapsed-time report; it starts with '#' and is appended to the output file as well.
$time = sprintf("#Success! time used: %ss", round($time_end - $time_start, 6));
echo $time;
file_put_contents("./{$table}.txt", $time, FILE_APPEND);
/**
 * Fetch the contents of $url with an HTTP GET request.
 *
 * The request uses a 10 second timeout and sends the Accept-language and Cookie
 * headers configured below.
 *
 * @param string $url Address to read.
 * @return string|false The response body, or false when file_get_contents() fails.
 */
function getContent($url){
    $opts = [
        'http' => [
            'method'  => "GET",
            'timeout' => 10,
            // All header lines belong in the single 'header' option, so they are joined
            // with "."; a line passed as a separate array element is not sent.
            'header'  => "Accept-language: en\r\n" . "Cookie: foo=bar\r\n",
        ],
    ];
    $context = stream_context_create($opts);
    return file_get_contents($url, false, $context);
}