PHP 在 CLI 模式下使用 cURL 模拟登录、保存 Cookie 并爬取数据
2023-06-29 本文已影响0人
阿然学编程
<?php
// Debug-friendly runtime settings: show errors in the output and report every error level.
ini_set('display_errors', 1);
error_reporting(E_ALL);
/**
 * Print a compact var_dump() of a value, formatted for the current SAPI.
 *
 * On the command line the dump is preceded by a line break; under any other
 * SAPI it is wrapped in a <pre> element so a browser keeps the layout.
 *
 * @param mixed $var Value to inspect.
 * @return void
 */
function dump($var)
{
    // Capture what var_dump() prints so it can be post-processed.
    ob_start();
    var_dump($var);
    $text = ob_get_clean();

    // Pull each value up onto the same line as its "[key]=>" marker.
    $text = preg_replace('/\]\=\>\n(\s+)/m', '] => ', $text);

    if (PHP_SAPI === 'cli') {
        echo PHP_EOL . $text;
        return;
    }

    // Escape for HTML unless Xdebug is loaded (presumably because its
    // var_dump() output is already HTML -- confirm before relying on it).
    if (!extension_loaded('xdebug')) {
        $text = htmlspecialchars($text, ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8');
    }

    echo '<pre>' . $text . '</pre>';
}
/**
 * Send an HTTP request with cURL and return the JSON-decoded response.
 *
 * The request always carries a JSON Content-Type header. A body is only sent
 * for POST requests; for any other method $data is ignored. cURL derives the
 * Content-Length from the POST body itself, so it is not set by hand.
 *
 * @param string            $url        Target URL.
 * @param array|string|null $data       Request body. Arrays are JSON-encoded; strings are sent as-is.
 * @param string            $method     'post' (case-insensitive) sends a POST; anything else is a plain GET.
 * @param string            $token      Sent in a "token" request header when non-empty.
 * @param string            $cookieName When non-empty, cookies are read from and written to
 *                                      "<cookieName>_cookie.txt" in this script's directory.
 * @return mixed Decoded JSON (objects as associative arrays), or null when the body
 *               could not be encoded, the transfer failed, or the response is not valid JSON.
 */
function http_curl($url, $data = null, $method = 'get', $token = '', $cookieName = '')
{
    // Arrays become JSON; slashes and non-ASCII characters are left unescaped.
    if (is_array($data)) {
        $data = json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
        if ($data === false) {
            // e.g. invalid UTF-8 in the payload: nothing meaningful to send.
            return null;
        }
    }

    $isPost = strtolower((string) $method) === 'post';

    $headers = ['Content-Type: application/json; charset=utf-8'];
    if ((string) $token !== '') {
        $headers[] = 'token: ' . $token;
    }

    $ch = curl_init();
    if ($ch === false) {
        return null;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    if ($isPost) {
        curl_setopt($ch, CURLOPT_POST, true);
        // Always hand cURL a string; a null body is sent as an empty one.
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data === null ? '' : (string) $data);
    }
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    // Return the response body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // NOTE(review): certificate and host verification are switched off, so the
    // connection is open to man-in-the-middle attacks. Kept as in the original;
    // enable both before sending real credentials over an untrusted network.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    // Connect within 5 seconds, finish the whole transfer within 10.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);

    // Use one file as both cookie source and cookie jar so a session persists between calls.
    if ($cookieName) {
        $cookie_file = __DIR__ . '/' . $cookieName . '_cookie.txt';
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
    }

    $result = curl_exec($ch);
    if ($result === false) {
        trigger_error('http_curl: request failed: ' . curl_error($ch), E_USER_WARNING);
    }

    // Drop the last reference to the handle. This closes it on every PHP
    // version (curl_close() has no effect since PHP 8.0) and is the point at
    // which the cookie jar is written.
    unset($ch);

    if ($result === false) {
        return null;
    }

    return json_decode($result, true);
}
/**
 * Interactive login: prompt for credentials on STDIN, log in, then scrape.
 *
 * Posts the credentials to the login endpoint. When the response reports
 * code 0 and carries a token, the token is passed to get_data() and the
 * scraped rows are printed with dump(). Otherwise the response payload is
 * dumped so the failure reason is visible.
 *
 * @return void
 */
function simulateLogin()
{
    // Read the credentials from the terminal.
    echo "请输入用户名:";
    $username = trim((string) fgets(STDIN));
    var_dump($username);

    echo "请输入密码:";
    $password = trim((string) fgets(STDIN));
    // The password is deliberately not echoed back to the terminal.

    // A captcha prompt could be added here if the target site requires one.

    $login_url = "https://www.test.com/api/login";
    $credentials = [
        'phone' => $username,
        'password' => $password
    ];
    $res = http_curl($login_url, $credentials, 'post');

    // Success requires an explicit numeric code of 0 AND a token; a missing
    // code must not be mistaken for 0 by a loose comparison.
    $loggedIn = is_array($res)
        && isset($res['code'])
        && is_numeric($res['code'])
        && (int) $res['code'] === 0
        && isset($res['data']['token']);

    if (!$loggedIn) {
        // Show the server's payload when there is one, else the raw result.
        $hasPayload = is_array($res) && array_key_exists('data', $res);
        var_dump($hasPayload ? $res['data'] : $res);
        return;
    }

    echo "登录成功!" . "\n";
    echo "抓取数据,请稍等..." . "\n";

    // Scrape everything with the session token and print it.
    // (Inserting the rows into a database would go here.)
    $data = get_data($res['data']['token']);
    dump($data);
}
/**
 * Fetch every page of the paginated list endpoint and return all rows.
 *
 * The first request (empty body) yields page one plus the total page count
 * in data.last_page; pages 2..last_page are then requested one by one.
 * A page whose response is missing or malformed is skipped with a warning
 * instead of discarding the rows collected so far.
 *
 * @param string $token Token sent in the "token" header of every request.
 * @return array All rows from the pages that could be fetched; an empty
 *               array when even the first page is unavailable.
 */
function get_data($token)
{
    $url = "https://www.test.com/api/dt/rlist";

    // First request: page one and the total number of pages.
    $res = http_curl($url, '', 'post', $token);
    if (!isset($res['data']['data']) || !is_array($res['data']['data'])) {
        trigger_error('get_data: first page request failed or returned no data', E_USER_WARNING);
        return [];
    }

    $pages = [$res['data']['data']];
    $last_page = isset($res['data']['last_page']) ? (int) $res['data']['last_page'] : 1;

    // Remaining pages, starting from the second.
    for ($page = 2; $page <= $last_page; ++$page) {
        $res = http_curl($url, ['page' => $page], 'post', $token);

        if (!isset($res['data']['data']) || !is_array($res['data']['data'])) {
            trigger_error("get_data: skipping page {$page}: request failed or returned no data", E_USER_WARNING);
            continue;
        }

        $pages[] = $res['data']['data'];
    }

    // Merge once at the end rather than re-copying the accumulated rows on
    // every iteration; $pages always holds at least the first page.
    return call_user_func_array('array_merge', $pages);
}
// Entry point: start the interactive login-and-scrape flow.
simulateLogin();
?>
image.png