Python 爬取携程酒店评论:携程酒店评论爬虫
2018-11-01 本文已影响214人
sexy_cyber
新增对“此酒店无点评”情况的过滤;如有问题,欢迎留言。
# -*- coding: utf-8 -*-
import scrapy
import PyV8
import csv
import re
import math
import time
import redis
import requests
import hashlib
import random
# Pool of mobile-browser User-Agent strings (Android and iPhone builds of
# UC Browser, QQ Browser, Baidu, WeChat/QQ in-app webviews and Safari).
# NOTE(review): how the pool is consumed is not visible in this part of the
# file; presumably one entry is picked at random per request (the module
# imports `random`) -- confirm against the request code.
# The order of the entries is kept as-is in case callers index into the list.
uas = [
    "Mozilla/5.0 (Linux; U; Android 5.0.2; zh-CN; Letv X501 Build/DBXCNOP5501304131S) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.10.0.800 U3/0.8.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 5.0.2; zh-cn; Letv X501 Build/DBXCNOP5501304131S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/6.7 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 5.1.1; vivo X6S A Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3 baiduboxapp/7.3.1 (Baidu; P1 5.1.1)",
    "Mozilla/5.0 (Linux; U; Android 4.3; zh-cn; N5117 Build/JLS36C) AppleWebKit/534.24 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.24 T5/2.0 baiduboxapp/7.0 (Baidu; P1 4.3)",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2_1 like Mac OS X; zh-CN) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/13D15 UCBrowser/10.9.15.793 Mobile",
    "Mozilla/5.0 (iPhone 6p; CPU iPhone OS 9_2_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/6.0 MQQBrowser/6.7 Mobile/13D15 Safari/8536.25 MttCustomUA/2",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13D15 Safari/601.1",
    "Mozilla/5.0 (Linux; U; Android 4.1.2; zh-cn; GT-S7572 Build/JZO54K) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/6.7 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 5.1.1; zh-cn; SM-J3109 Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/6.6 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; Coolpad 8297-T01 Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/6.6 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 5.1.1; zh-CN; MX4 Pro Build/LMY48W) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.10.0.800 U3/0.8.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; Android 5.1; m2 note Build/LMY47D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.114 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 5.1; zh-CN; m2 note Build/LMY47D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.9.10.788 U3/0.8.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 5.1; zh-cn; m2 note Build/LMY47D) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/6.6 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; CHM-CL00 Build/CHM-CL00) AppleWebKit/534.24 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.24 T5/2.0 baiduboxapp/7.1 (Baidu; P1 4.4.4)",
    "Mozilla/5.0 (Linux; Android 5.0.1; HUAWEI GRA-TL00 Build/HUAWEIGRA-TL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile Safari/537.36 MxBrowser/4.5.9.3000",
    "Mozilla/5.0 (Linux; Android 5.0.1; HUAWEI GRA-CL00 Build/HUAWEIGRA-CL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3 baiduboxapp/7.3.1 (Baidu; P1 5.0.1)",
    "Mozilla/5.0 (Linux; Android 5.0.2; Redmi Note 2 Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3 baiduboxapp/7.3.1 (Baidu; P1 5.0.2)",
    "Mozilla/5.0 (Linux; Android 4.4.4; Che1-CL10 Build/Che1-CL10) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3 baiduboxapp/7.3.1 (Baidu; P1 4.4.4)",
    "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; HUAWEI P6-C00 Build/HuaweiP6-C00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/6.7 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.3; R7007 Build/JLS36C) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3 baiduboxapp/7.3.1 (Baidu; P1 4.3)",
    "Mozilla/5.0 (Linux; Android 5.1.1; KIW-CL00 Build/HONORKIW-CL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/7.1 baidubrowser/7.1.12.0 (Baidu; P1 5.1.1)",
    "Mozilla/5.0 (Linux; Android 5.1.1; MX4 Pro Build/LMY48W) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036215 Safari/537.36 V1_AND_SQ_6.3.7_374_YYB_D PA QQ/6.3.7.2795 NetType/WIFI WebP/0.3.0 Pixel/1536",
    "Mozilla/5.0 (Linux; Android 5.1; m2 note Build/LMY47D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036222 Safari/537.36 V1_AND_SQ_6.3.3_358_YYB_D QQ/6.3.3.2755 NetType/WIFI WebP/0.3.0 Pixel/1080",
    "Mozilla/5.0 (Linux; Android 4.4.4; CHM-CL00 Build/CHM-CL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36 V1_AND_SQ_6.3.7_374_YYB_D QQ/6.3.7.2795 NetType/WIFI WebP/0.3.2 Pixel/720",
    "Mozilla/5.0 (Linux; Android 5.0.1; HUAWEI GRA-TL00 Build/HUAWEIGRA-TL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036215 Safari/537.36 MicroMessenger/6.3.16.49_r03ae324.780 NetType/WIFI Language/zh_CN",
    "Mozilla/5.0 (Linux; Android 5.0.1; HUAWEI GRA-CL00 Build/HUAWEIGRA-CL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036519 Safari/537.36 V1_AND_SQ_6.3.1_350_YYB_D QQ/6.3.1.2735 NetType/WIFI WebP/0.3.0 Pixel/1080",
    "Mozilla/5.0 (Linux; Android 4.4.4; Che1-CL10 Build/Che1-CL10) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036222 Safari/537.36 V1_AND_SQ_6.3.0_348_YYB_D QQ/6.3.0.2730 NetType/WIFI WebP/0.3.0 Pixel/720",
    "Mozilla/5.0 (Linux; Android 5.1.1; KIW-CL00 Build/HONORKIW-CL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 V1_AND_SQ_6.2.3_336_YYB_D QQ/6.2.3.2700 NetType/WIFI WebP/0.4.1 Pixel/1080",
    "Mozilla/5.0 (Linux; Android 4.4.2; HUAWEI P6-C00 Build/HuaweiP6-C00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036518 Safari/537.36 V1_AND_SQ_6.3.1_350_YYB_D QQ/6.3.1.2735 NetType/WIFI WebP/0.3.0 Pixel/720",
    "Mozilla/5.0 (Linux; Android 5.0.2; Letv X501 Build/DBXCNOP5501304131S) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036519 Safari/537.36 V1_AND_SQ_6.3.7_374_YYB_D QQ/6.3.7.2795 NetType/WIFI WebP/0.3.0 Pixel/1080",
    "Mozilla/5.0 (Linux; Android 5.1.1; vivo X6S A Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036222 Safari/537.36 V1_AND_SQ_6.3.3_358_YYB_D QQ/6.3.3.2755 NetType/WIFI WebP/0.3.0 Pixel/1080",
    "Mozilla/5.0 (Linux; U; Android 4.3; zh-cn; N5117 Build/JLS36C) AppleWebKit/533.1 (KHTML, like Gecko)Version/4.0 MQQBrowser/5.4 TBS/025489 Mobile Safari/533.1 V1_AND_SQ_6.3.7_374_YYB_D QQ/6.3.7.2795 NetType/WIFI WebP/0.3.0 Pixel/720",
    "Mozilla/5.0 (Linux; Android 4.3; R7007 Build/JLS36C) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036518 Safari/537.36 V1_AND_SQ_6.3.7_374_YYB_D QQ/6.3.7.2795 NetType/WIFI WebP/0.3.0 Pixel/720",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 9_0_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13A404 QQ/6.3.3.432 V1_IPH_SQ_6.3.3_1_APP_A Pixel/640 Core/UIWebView NetType/WIFI Mem/10",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13D15 QQ/6.3.3.432 V1_IPH_SQ_6.3.3_1_APP_A Pixel/1080 Core/UIWebView NetType/WIFI Mem/104",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13C75 QQ/6.2.3.409 Pixel/750 NetType/WIFI Mem/703",
    "Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; Coolpad 8297-T01 Build/KTU84P) AppleWebKit/533.1 (KHTML, like Gecko)Version/4.0 MQQBrowser/5.4 TBS/025477 Mobile Safari/533.1 V1_AND_SQ_5.9.0_270_YYB_D QQ/5.9.0.2530 NetType/WIFI WebP/0.3.0 Pixel/720",
    "Mozilla/5.0 (Linux; Android 4.1.2; GT-S7572 Build/JZO54K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036519 Safari/537.36 V1_AND_SQ_6.3.3_358_YYB_D QQ/6.3.3.2755 NetType/WIFI WebP/0.3.0 Pixel/480",
    "Mozilla/5.0 (Linux; Android 5.1.1; SM-J3109 Build/LMY47X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.121 Mobile Safari/537.36 V1_AND_SQ_6.3.6_372_YYB_D QQ/6.3.6.2790 NetType/WIFI WebP/0.4.1 Pixel/720",
    "Mozilla/5.0 (Linux; Android 4.4.4; HM 2A Build/KTU84Q) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036215 Safari/537.36 V1_AND_SQ_6.3.1_350_YYB_D QQ/6.3.1.2735 NetType/4G WebP/0.3.0 Pixel/720",
]
# HTTP forwarding-proxy endpoint.
# NOTE(review): where this value is attached to outgoing requests is not
# visible in this part of the file -- confirm against the request code.
proxy = 'http://forward.xdaili.cn:80'

# Client for a Redis server on the local machine.
# NOTE(review): what is stored in Redis is not visible in this part of the
# file -- confirm against the code that uses `redis_cli`.
redis_cli = redis.Redis(
    port=6379,
    host='127.0.0.1',
)
# total=0
old=['3627729', '2118053', '6819061', '2298484', '2172706', '1876428', '1589077', '11380770', '1958774', '6842574', '1759484', '1086305', '4381334', '1481138', '837717', '1204038', '2360469', '1211487', '13734060', '1339925', '1425259', '13914734', '11214151', '1406213', '2297873', '11908627', '2132052', '458454', '2202365', '5918160', '1961888', '900093', '6863196', '1889452', '2300884', '6796357', '4638183', '12071566', '6731410', '1003237', '1230262', '856746', '17450212', '1098676', '5902236', '2303449', '2304848', '1641372', '6832630', '1902963', '1690274', '5489745', '18517273', '19649742', '1958350', '1351685', '1440921', '426994', '1737616', '1850267', '1213467', '1063955', '6836738', '6743779', '910182', '1684318', '3906182', '2305225', '6056302', '1051548', '2316765', '1541864', '1622779', '4533099', '1361685', '1961356', '6868084', '1541722', '1685719', '3732967', '6841828', '1909064', '2134802', '1977131', '1320141', '6800690', '1401639', '6864121', '17540271', '2338661', '2071002', '1000280', '11304544', '18437686', '6822720', '1057158', '2299473', '19725312', '1372418', '3658292', '2304165', '2226461', '5377003', '19480114', '1207371', '1461331', '1509564', '1718137', '3183635', '1883192', '1702093', '1033589', '1369529', '2326388', '1183467', '897902', '5318859', '6447584', '5620340', '1871846', '2275937', '18410678', '8554137', '1440677', '9608813', '4026013', '1952183', '1002883', '1971555', '5559389', '19492379', '6868003', '15852978', '2569678', '3090319', '5305869', '789681', '4422716', '1530265', '2907898', '1922766', '1318891', '6799625', '808314', '17485198', '19588495', '2578930', '4545488', '2128340', '1862369', '1243463', '2201041', '7388277', '1298667', '772911', '835663', '12824495', '8164434', '4536530', '4957915', '11305400', '6842321', '6793598', '1444719', '1504544', '5895556', '807599', '1251554', '1728563', '854024', '1433839', '8703926', '19573376', '1064712', '6800640', '1893626', '1386622', '6780254', '1804648', '11235005', 
'5253644', '1699226', '925466', '1615699', '1389740', '8956038', '1050075', '5981168', '6734645', '1995830', '3037916', '1721427', '18532918', '1619888', '1293971', '1613699', '1912573', '8899297', '1785554', '2879977', '1106164', '1772086', '2697957', '781417', '1417314', '2310181', '13350853', '9649319', '1782553', '1410240', '4061770', '1806911', '1031098', '908062', '5994269', '4695161', '2361906', '1536549', '1904263', '1003886', '13994552', '1498677', '6836817', '1679292', '6789056', '1416817', '1771571', '833235', '801467', '6741064', '12495955', '1778602', '2098708', '19581327', '2251644', '17900460', '10919182', '932780', '817651', '1742228', '2598403', '6982369', '2307204', '5208128', '15883026', '4820960', '2299915', '2304443', '424783', '827399', '1259271', '8682776', '2296723', '6869005', '16012196', '2636737', '2055655', '975585', '11242209', '1643477', '4498218', '1343986', '1119837', '2368580', '6824000', '5798750', '2313530', '2390100', '1300692', '6955100', '5128180', '3489839', '1283376', '4662024', '2164332', '6851365', '6420173', '436885', '789903', '17525164', '1565995', '2266858', '4826391', '4836470', '3717280', '6809293', '799216', '5660950', '2057506', '2917051', '437211', '12674014', '12432915', '6237664', '2079683', '8617592', '2299701', '1541071', '2124267', '6719075', '1664819', '1769801', '8589075', '2702330', '1614821', '19842670', '1815927', '3045403', '3462996', '779556', '1896552', '1430994', '3243870', '4699014', '6815195', '1501823', '17140272', '506114', '2626537', '17312486', '8515070', '10919592', '15898299', '1937749', '2270930', '1622760', '19626947', '1202072', '1875588', '15535237', '2115507', '5952250', '9615273', '1282155', '17248119', '19615593', '2925997', '6774897', '1995775', '810166', '18473844', '1355855', '1010947', '15950451', '2000674', '3784210', '5464502', '17936228', '1472425', '1061140', '17500345', '8499950', '6830359', '16482272', '8971551', '1471532', '2339350', '970013', '4817553', '2304510', '1461184', 
'1726601', '1719656', '11809730', '5892409', '1614065', '2029829', '833537', '1337616', '1839241', '2331793', '839348', '1280933', '1827447', '813791', '15958021', '1644328', '2496479', '1699216', '2310563', '10896773', '4978300', '14256752', '1102034', '2567461', '5380683', '4509755', '2364144', '2384525', '2082730', '2118242', '3067094', '6026507', '15879548', '2313410', '2372869', '2728162', '2300393', '3662772', '2260030', '1503178', '1539974', '987066', '1315268', '16105979', '5804593', '1122006', '5908318', '1096283', '2327445', '8719330', '1350409', '1517922', '1526549', '3029011', '6867110', '1302236', '2313207', '443637', '1406429', '895293', '8122228', '3363872', '3731868', '15120966', '3184096', '2575689', '982235', '1624670', '2307498', '16078771', '1733770', '1361945', '3628017', '6751984', '1093109', '910602', '1734108', '8501503', '2614947', '2220838', '6780926', '1994442', '1577450', '1935049', '2313551', '2306038', '954175', '9194330', '1539952', '1905845', '15305684', '8111913', '1926067', '915173', '837139', '1478386', '1408778', '2302426', '1896498', '1357885', '1618279', '1252021', '1900932', '15056164', '6866597', '1577403', '2637370', '1098048', '1570951', '1512316', '979354', '1892712', '9530920', '16926040', '9615277', '446058', '4330650', '4500646', '14175703', '14552657', '19451247', '1740495', '2301605', '1617055', '6746002', '9349241', '894006', '2304821', '1316894', '976648', '2967568', '1880519', '1286710', '1628016', '1669459', '1666388', '1881821', '1500874', '812399', '2939323', '1801375', '6743939', '1012816', '1044796', '3289648', '1962084', '895154', '2307731', '896820', '1061124', '1847310', '2014179', '1623990', '838949', '17354610', '2245193', '16209730', '1351730', '8650112', '1691421', '2299085', '1005864', '1478046', '1366993', '6827985', '6006630', '8077236', '4012618', '826170', '820141', '1368596', '6843695', '11469990', '1839317', '1434296', '2355977', '1411662', '1644646', '18538852', '436420', '1725945', '1114448', 
'18438319', '10218353', '6191799', '5515704', '1780271', '1910620', '2981182', '996899', '8565876', '5903965', '2390185', '1610072', '1389848', '18070354', '19678378', '17501715', '1354870', '13959553', '2145766', '2619646', '2331438', '1978645', '2782017', '6804534', '1171698', '2266778', '827316', '8338387', '1682103', '1920638', '4690439', '2308932', '2307186', '1994125', '4030036', '1699182', '1925635', '16079469', '1909856', '1000673', '5382159', '1395780', '5704076', '1647968', '1483133', '17502422', '2916975', '2075755', '3358872', '1727701', '4652678', '835630', '16028446', '1767800', '2158242', '1466303', '764916', '1566929', '1920794', '4645339', '6793569', '1679598', '2082982', '971525', '1970708', '1331427', '6467417', '1887724', '11080357', '6738649', '1589197', '1425289', '4602163', '2541865', '1213032', '1425508', '1596950', '11475363', '2308125', '791767', '11075226', '801950', '2148544', '1639810', '1470101', '5159224', '2605823', '1582445', '6814717', '2238376', '1207720', '1397200', '6723078', '8647127', '17042433', '5523358', '1780214', '18068125', '2043260', '1814201', '913016', '19598709', '770150', '12254291', '916884', '1578908', '1441636', '5944697', '1633255', '15977191', '914934', '1916275', '8616562', '5953193', '6847240', '16132337', '1348364', '3799953', '1183576', '14766410', '5309947', '5378002', '2338371', '10201560', '11420966', '1136911', '5247165', '11467781', '11083458', '1362214', '6801449', '13607365', '6008144', '893696', '1759475', '8571421', '2191621', '2301016', '18078743', '6821435', '2079637', '7701842', '1736503', '8869678', '2042625', '6727645', '2626171', '5947940', '2375787', '13856396', '4615038', '2324921', '9242588', '2307662', '1684928', '8808260', '1959191', '6785397', '1737540', '1104401', '1959327', '15684503', '11215076', '11303442', '1299660', '5790947', '2327893', '13707206', '1208041', '1766894', '6794546', '19521283', '2304426', '1111044', '2306027', '2887756', '1686690', '2161185', '811339', '4632578', 
'19600530', '790172', '2200678', '1267352', '1342356', '15893501', '17905179', '3328287', '2176031', '802757', '1628289', '11122332', '759191', '1632099', '1728977', '13925627', '2306753', '432493', '2536833', '1608477', '19536332', '1574415', '15249094', '1355662', '11256727', '1609100', '1876812', '1900309', '1630845', '2169373', '6737990', '1511903', '15277222', '3271326', '5606898', '4340023', '1978638', '1981821', '1902459', '1789436', '6336840', '2613961', '779055', '2032548', '8669778', '1304792', '2574866', '813086', '2086497', '1786613', '19479396', '1529486', '1628173', '2300079', '16116909', '1798628', '2120515', '1708037', '1596368', '1376854', '13362245', '925099', '1100112', '12132647', '16004258', '3120468', '1458133', '1104245', '1849000', '1526840', '1726121', '7921948', '6790801', '4434046', '1786817', '1281465', '1609951', '1373920', '2165091', '2305213', '817809', '6335000', '3441542', '15168530', '4328258', '1456575', '8558610', '442317', '9601742', '2168924', '2157683', '1273899', '3669318', '1213130', '2930165', '2302012', '2603508', '4532627', '12638142', '1736279', '2073354', '1725966', '1794993', '18469216', '763353', '18036038', '16920326', '1014772', '9595855', '5242650', '922896', '1208915', '14074673', '1171449', '6186985', '898406', '1902836', '11302741', '1465117', '19682169', '19678032', '1848354', '9595430', '1725869', '1281935', '18560222', '1458569', '1294571', '4637967', '1695527', '1767852', '13991613', '6760627', '765320', '15066116', '4603904', '7850613', '10900563', '5980774', '859555', '15058548', '4535620', '6747247', '930694', '1204838', '9149477', '1513597', '15977534', '3797634', '8557512', '5898671', '1410576', '4425256', '1355589', '1200149', '6004315', '1814570', '1966109', '8685439', '2700617', '2301549', '12854039', '2350342', '3733392', '4844103', '1383203', '1003308', '1566037', '1953715', '1424122', '1246492', '2649450', '3181083', '1489054', '12253465', '1440851', '1485997', '6131351', '4662663', '1335023', 
'1734830', '2089239', '6868530', '17334886', '14121958', '6752647', '1719561', '3796864', '1272905', '18048405', '757223', '2600608', '1908436', '1937701', '1346484', '2324087', '980087', '12067834', '2638609', '1315506', '1199101', '19073083', '2966612', '6664026', '831997', '6749879', '14121740', '1498198', '903620', '796060', '1675612', '1900967', '6801413', '2637030', '1111106', '1775941', '1624294', '16132332', '1382115', '1002247', '6805573', '1485627', '2231664', '1993495', '1372722', '14214462', '806427', '839333', '2116230', '4375996', '2080741', '5175723', '4727431', '1183804', '1454587', '1284867', '1768439', '1093103', '1565747', '5176021', '764800', '2546338', '2207313', '6857516', '18409757', '770256', '2303460', '2169270', '745510', '3673072', '9457932', '6442639', '5494990', '2307936', '9896109', '6758059', '3139851', '1199258', '1231039', '1216584', '3639079', '1250175', '482671', '19677584', '1909498', '1776856', '6861873', '13987192', '815769', '5246677', '1867716', '434399', '2307753', '1597683', '1319027', '2524489', '2330565', '1632823', '1790120', '13437013', '8920338', '1111354', '899197', '1047873', '16899409', '4645533', '11243950', '2354694', '6796721', '2294412', '1438821', '2299860', '1840195', '3732175', '1807842', '2599547', '17195673', '13598388', '5381506', '1015302', '2636826', '844846', '12131521', '6778787', '17339014', '2309486', '5148395', '11306241', '8573127', '12879519', '4629981', '5918052', '7344772', '6872564', '19638358', '1873995', '3109539', '1597527', '6808029', '1937815', '19440655', '1501161', '1697742', '4614554', '2171642', '1212655', '4837488', '2205786', '1338722', '2304486', '2164438', '2301005', '827381', '5729044', '11265099', '993285', '1583106', '5809766', '6542608', '1328815', '2012838', '1427247', '4846284', '2296838', '2311159', '14153143', '9558222', '6859929', '6810418', '4439312', '1775687', '1429383', '2135353', '1687837', '1626559', '6721165', '987321', '5979817', '768904', '4068627', '1300087', 
'2325873', '6071131', '6528144', '2971636', '1435224', '5701662', '2568390', '6842506', '1891735', '9626438', '846919', '6160704', '1568404', '3742776', '12628163', '1410015', '2056315', '2599332', '4693075', '1597898', '6285099', '1511543', '1961052', '2097630', '1511718', '6829905', '5666877', '457195', '2469448', '15734018', '1541098', '1487963', '4340556', '5385292', '3292533', '2298608', '2478568', '6801045', '1504697', '1343246', '1611048', '8647374', '976493', '6665675', '17539931', '1512321', '5799567', '4509481', '1273222', '1722772', '1536965', '2240024', '6804845', '1293181', '11978648', '2221067', '1606408', '1404545', '1200180', '2639275', '5274866', '1246580', '987747', '2302025', '1504983', '7819371', '11101719', '6080805', '1922201', '8924704', '5053122', '2883397', '1268681', '1098832', '8568573', '6845816', '2275754', '4619105', '4964417', '17288587', '2013377', '4532784', '1290080', '1987062', '2987449', '1339031', '2299788', '17880772', '5549903', '1598256', '13957859', '1529412', '828336', '1911482', '1298016', '1273915', '2140711', '5337619', '3031966', '1064017', '1630416', '6867418', '2385312', '1972803', '439806', '1892548', '12679666', '18436565', '5638809', '5276821', '1049728', '11673355', '6156030', '1764493', '2035309', '1005227', '1692278', '6456830', '712965', '1714723', '5371888', '1652128', '481278', '6824473', '1652323', '811278', '8198072', '19537874', '13307128', '2216135', '1787148', '2730771', '1573299', '1766892', '2306370', '18067843', '2379383', '9470351', '1312528', '1516850', '1764280', '2100191', '4029598', '2290483', '2310742', '18543345', '4547786', '1933650', '6835041', '12084907', '6851212', '1770291', '6874163', '2308355', '15708968', '2857949', '6841880', '2308892', '1636766', '6778880', '8849816', '1848685', '6857869', '8568564', '4723764', '5968953', '5758479', '15698658', '16100391', '1472415', '1875550', '5273815', '1728622', '2333002', '1332985', '2307657', '6724408', '6839712', '1595864', '1719544', 
'1227936', '3642684', '4398288', '1633182', '5464141', '4964241', '2919620', '428738', '6637184', '7649420', '1393678', '1320954', '4540234', '1477326', '1745354', '2234677', '2300090', '1579479', '5386765', '11303940', '1434501', '1467034', '4964158', '912610', '1501806', '2471849', '13977481', '2950022', '2933419', '2304220', '5498636', '2279140', '1349887', '854665', '1854129', '898595', '11840863', '3986240', '5300506', '4649512', '1103024', '6864122', '4532764', '1184062', '17888946', '835325', '5701819', '2097347', '8870488', '6814258', '1306468', '1463615', '9595280', '795375', '2042661', '6980975', '4207778', '1839755', '1015292', '14075507', '844771', '2162716', '1770978', '1709548', '2326094', '6071138', '17264377', '8566777', '1789092', '1998141', '1627427', '833650', '2920258', '12046082', '1869749', '11273846', '8655152', '1488573', '1332202', '8566781', '993179', '3727183', '711384', '1502112', '2626258', '1323423', '4529996', '1387898', '1408391', '1280951', '4395214', '5791493', '2303536', '1707833', '1329896', '1908046', '1739134', '2252944', '15119564', '18003149', '15989898', '1619965', '1314741', '1368477', '6308137', '1644543', '6718957', '1698378', '4959570', '1525357', '8649470', '1595759', '2303537', '2304099', '1986965', '2490499', '15250033', '8597074', '1526981', '1703157', '11298942', '1798869', '12294649', '1437005', '6816299', '1695753', '6871854', '2206182', '3054390', '6299654', '6721247', '1210040', '10898061', '13976026', '15865112', '13985457', '469240', '2470659', '549251', '6838972', '5204628', '2914957', '5032767', '1645673', '2312434', '1429470', '2906680', '435607', '4424287', '798560', '2073118', '5523632', '2311188', '1253193', '2126276', '11427224', '1808821', '2301349', '1836442', '15922731', '1736057', '2311267', '6818992', '2041135', '1616172', '2386902', '9114127', '2204122', '17351705', '845942', '879203', '4339574', '8568026', '1102398', '17495705', '1008295', '1460811', '1889523', '12055098', '2069679', '1596944', 
'1119834', '4742242', '1764907', '1401578', '2095641', '2305511', '6265959', '5982624', '1052490', '2986144', '6843372', '2260063', '924487', '3266257', '6801788', '1465602', '2308508', '18067761', '18432655', '705349', '971378', '1814472', '1889169', '1777032', '2914855', '1614217', '1535444', '18518392', '1445079', '1612799', '1504077', '2906785', '906052', '2646526', '17893108', '5636962', '963260', '3244302', '6824856', '1644308', '4618578', '1993644', '12479474', '12256906', '4983724', '10900985', '732567', '2231219', '1072062', '14262700', '13693016', '2258446', '5661697', '5316052', '1634893', '5233755', '15056307', '6747312', '1301535', '14519435', '17335897', '1502226', '6721917', '1001338', '8041739', '993192', '4651952', '6871799', '1850056', '9557921', '1319623', '1742234', '1949303', '1599739', '6003321', '1071617', '6810859', '1631426', '5218811', '2386165', '8566081', '2893923', '1569543', '6954900', '17906307', '4034815', '8565415', '12105673', '1489399', '5759853', '8549254', '1461328', '2307742', '1922054', '1962367', '1398628', '6796297', '15994276', '455399', '2203410', '1200053', '1713536', '12549177', '1362058', '14638396', '5806178', '1923625', '6779978', '7219819', '1787276', '6793153', '1222494', '6030639', '837152', '1334738', '17189621', '2301612', '906433', '1308946', '16107844', '1722537', '1829608', '1606752', '5254309', '1631473', '1604443', '5798366', '3489164', '3735325', '1198392', '1243356', '8564790', '11536424', '5193260', '1121355', '6833209', '1980915', '854574', '2511064', '2139090', '5493032', '6841697', '993168', '6039857', '16439683', '14145730', '1330841', '1195843', '988446', '15226304', '1785226', '1235758', '18426085', '2165172', '8028304', '2008798', '2120334', '2513965', '470042', '6824503', '4440526', '1457160', '1348356', '6805325', '1839488', '2619209', '1009576', '2308012', '1347767', '19477009', '2206823', '9586386', '10226914', '2311798', '795534', '19492321', '1346881', '1993462', '1782566', '4328209', 
'15994123', '3031791', '11404014', '846376', '10606625', '5208789', '8531861', '2297087', '1744011', '2031344', '17592579', '9316213', '436422', '1211586', '5663016', '1414993', '6843499', '19590084', '5282585', '15342688', '4837255', '1054072', '3638973', '5923924', '1349470', '9786613', '482279', '2208578', '4836593', '839566', '2495411', '878720', '15893537', '17893138', '4633218', '809790', '1600207', '3109299', '1839280', '5381040', '1250715', '1764227', '15159962', '4014890', '8579631', '2264446', '4842539', '5459970', '2003991', '15046934', '6830006', '4633806', '2304541', '1743259', '7369044', '5874951', '1687320', '2574763', '1182404', '1926553', '2092881', '6311240', '635052', '1085128', '1797968', '1802283', '766504', '2353392', '1319287', '5894563', '6484420', '1829022', '6871609', '2304020', '6748294', '1522491', '1194069', '2382865', '3796605', '1899210', '3067463', '436752', '6801023', '2000917', '14091260', '447042', '17589188', '16096506', '2301607', '1892767', '1958197', '5247005', '982506', '15342925', '2306843', '12598692', '6857204', '2372516', '12428221', '1343316', '2299951', '6829094', '966899', '1638482', '1459191', '11714259', '1906031', '1911503', '1389794', '2106630', '2360381', '5982043', '1183291', '2310101', '1650889', '6730026', '4477933', '6827980', '1944386', '1633220', '7041178', '3180841', '759170', '2906597', '1288182', '7222127', '2309441', '6842542', '15827580', '2299539', '427759', '1722593', '775306', '4981226', '10846571', '1925650', '1678450', '818465', '1351400', '2344099', '2302029', '1712702', '2524337', '2312429', '1319521', '1285957', '1047944', '7912422', '1729342', '1412355', '19677076', '1280935', '2188170', '634796', '19631439', '1110871', '1520746', '2039084', '1501114', '931041', '2581996', '1961494', '1786417', '1097092', '1303601', '1217381', '6845679', '4649034', '5086893', '8573046', '1913837', '2308095', '5910628', '2298397', '758912', '2381746', '1938629', '8368908', '1631227', '2310745', '2085318', 
'841801', '2541857', '6848006', '19851030', '770407', '2299025', '6838783', '5404281', '1228912', '4397955', '1011380', '1016746', '2240086', '1601769', '6804544', '1887922', '14003764', '2306505', '3489224', '8179587', '2567956', '6861538', '2301906', '8651230', '914280', '6841811', '2058995', '1388182', '2304275', '6791966', '776385', '6865546', '1678938', '3459779', '1137716', '17900653', '2308454', '19602360', '15041311', '4122364', '1683004', '798558', '1333617', '1381002', '1737490', '484047', '19851349', '16947455', '2304648', '10549551', '1577193', '5459919', '6759220', '13730424', '988669', '1908062', '798644', '14478181', '3036132', '1361490', '3181983', '12660767', '4694948', '1941197', '1919981', '6733292', '1892923', '5209010', '1465078', '762449', '6823763', '1207976', '4337176', '1054109', '1540010', '833915', '2304059', '1521061', '1470049', '921161', '1050702', '768877', '6921582', '963349', '980038', '895742', '2994214', '1065688', '6718807', '482829', '13977015', '7492361', '1317258', '6902621', '1508152', '12256292', '2399462', '2008422', '1196576', '2310155', '1361053', '1104133', '6761971', '2343370', '15881473', '5804097', '1354994', '6811814', '6822780', '1839226', '9889366', '1608408', '912192', '5245144', '2325940', '17097820', '2893137', '1851132', '18067933', '5451884', '1412032', '1254404', '1675785', '1715021', '6265951', '6824568', '1207365', '4841942', '1894207', '9203734', '11373693', '3200989', '11535981', '1471363', '6839672', '1598314', '450374', '2306282', '5372200', '1137010', '3327517', '6520221', '4120903', '18409880', '1863633', '6797498', '1995687', '1791401', '1320882', '1583786', '963952', '1010154', '2599383', '11156161', '4041766', '19572797', '14418843', '6479820', '17079333', '946201', '3995602', '2041772', '1192747', '6078076', '1511679', '1348739', '4823149', '444245', '873401', '965856', '6727128', '1307800', '12170083', '17346200', '8495720', '6797781', '2376009', '19477511', '2728223', '2217056', '822382', 
'2169020', '2333647', '6634962', '5752588', '835660', '3244548', '19476660', '6873093', '1070932', '816161', '1045189', '19678374', '1611852', '1249476', '1721909', '19627641', '1990741', '13927820', '6901761', '1357219', '1484559', '5906858', '17503276', '1500995', '1696860', '2223931', '2353468', '13971931', '11365915', '1465055', '2331763', '6763293', '1773977', '916526', '1789218', '4641595', '3395271', '12550514', '979162', '2069825', '781670', '1612893', '1464944', '968489', '6902661', '1111390', '1776878', '6760659', '979804', '2168623', '980000', '2330984', '1213609', '1317958', '1695623', '2707449', '1766786', '1053538', '1994732', '2124857', '2221080', '835805', '6870587', '2079079', '1409557', '1979270', '428777', '975788', '2310203', '1387827', '2346619', '1718925', '10156022', '827374', '1650498', '2313209', '1413099', '1380688', '16291107', '1679551', '1842265', '6820946', '790693', '1827589', '11282613', '1910900', '1463375', '6727289', '10188651', '1579908', '2384843', '6839822', '19634090', '790224', '2297184', '4341061', '2306823', '16484718', '1743447', '18560660', '4710561', '1298920', '1401656', '19680062', '18473944', '1683572', '1951420', '5541435', '7449276', '1715739', '18067041', '2310954', '4719752', '1688863', '13326756', '6832924', '4999322', '1619932', '954127', '1351695', '6724410', '2390517', '2306530', '1250910', '1466570', '839213', '17446895', '5907176', '4332933', '840601', '1646291', '1110729', '2304289', '2728174', '4508775', '4742121', '1379313', '3328165', '9647823', '5753999', '12645410', '8903301', '1758586', '844960', '3002297', '12606378', '19581675', '2308745', '1917737', '1071975', '6434060', '5323650', '1440419', '1925879', '6742360', '1759453', '4065724', '1404812', '2732826', '1905692', '1709475', '17017060', '752449', '2307493', '4888867', '2893973', '1622461', '2299402', '1905164', '1915934', '1882087', '13876659', '11409175', '1809359', '2304167', '1360923', '1118092', '15039253', '6010033', '2624024', '2620786', 
'6927999', '1218388', '807238', '1067582', '11160436', '854138', '1502248', '1294447', '15268291', '5372117', '18112268', '4370760', '1918415', '764842', '13707133', '1298628', '5887871', '6800600', '1246063', '1804256', '1569998', '795756', '2525704', '1467225', '6081351', '11303095', '2302170', '1848116', '2303594', '1712679', '992528', '1637130', '1007242', '1430342', '1450782', '979098', '1407479', '3627829', '968910', '4980528', '1300070', '6108090', '917986', '1749156', '2305461', '8568249', '834897', '2324627', '2540874', '2905580', '6750275', '2637083', '13969024', '14999558', '3637295', '4528873', '1732929', '1215531', '1619217', '2081017', '1136745', '3084247', '1526823', '2305897', '1583807', '7926640', '1137305', '1434311', '436747', '1096653', '2379356', '8950326', '17900872', '1093613', '481846', '1543141', '8077253', '826324', '13975974', '3631036', '15154383', '1054122', '1435324', '6873742', '792644', '2044046', '19494449', '14075818', '17862934', '8560572', '2172772', '4818634', '12660988', '1784364', '8554722', '969561', '1728443', '5081100', '2307663', '5089471', '2116648', '5248169', '1622838', '16944164', '766524', '2098635', '1303572', '987758', '3360067', '2277831', '947030', '4056690', '15999519', '2053854', '1478572', '1300612', '6116304', '11276210', '12257463', '1962874', '6747510', '1476031', '1871559', '8922158', '1452349', '6844939', '12854466', '2204595', '2142669', '1008002', '2229615', '4509860', '925695', '1992537', '434835', '1682883', '1520753', '6413435', '798819', '6793680', '6980424', '2205324', '1257665', '1718158', '6478463', '8797159', '2167716', '19574584', '438270', '9313822', '6780661', '15814641', '1344625', '6821746', '2612086', '6802848', '17926541', '1744343', '2302239', '2858562', '1355071', '1438179', '19852654', '6609491', '6790391', '4440371', '1603752', '6827629', '15898306', '898426', '4041868', '8554140', '2220597', '4208092', '5754265', '1340318', '1615606', '1350925', '6790018', '12628531', '9551836', 
'2880515', '8514734', '1431075', '895694', '15046886', '1535496', '12647934', '1357754', '8671770', '814942', '1509159', '3109993', '6483710', '9665292', '697586', '5659366', '11210480', '981463', '4685029', '10211171', '8924120', '2284075', '1255468', '6059856', '15901234', '1227176', '4055862', '1233069', '1251491', '5349201', '1921469', '17297413', '835759', '821666', '1408382', '1617373', '2511238', '1712753', '4614517', '1599684', '2271037', '1628002', '1902888', '4603362', '12627909', '6846805', '1181571', '896718', '1764841', '2509643', '968899', '996060', '1810879', '6748213', '17334992', '2001384', '4605963', '1441041', '1615538', '2302765', '2496006', '2313036', '2300868', '2304247', '2588584', '1842680', '2705434', '6285088', '8013613', '1002876', '11514993', '970592', '2157061', '10475321', '17335633', '11296573', '915647', '1589885', '5919845', '12392940', '2304457', '1693127', '1015222', '1695275', '16001153', '8023456', '1908021', '1732778', '1500506', '2159620', '986647', '6780930', '1452112', '811321', '15303900', '6043814', '3045493', '1350205', '1785726', '1082106', '1887117', '6724960', '2361683', '1929367', '2636634', '812225', '2306439', '1013653', '8511573', '8328830', '930117', '3000051', '6816940', '1097910', '1521776', '2302468', '1726961', '981556', '9241131', '4830118', '1767157', '6866998', '818830', '9596164', '5235155', '2135921', '4455892', '6300082', '811480', '7081741', '13423009', '3180394', '1619986', '1289886', '1485601', '754253', '1467429', '1095257', '1610700', '6687934', '5983375', '3188906', '2986787', '1218197', '5646965', '10883638', '2080044', '1183886', '6902759', '11246826', '1083969', '900611', '2285492', '1692812', '2368567', '12828466', '2310655', '1316857', '2014068', '2200929', '5255921', '1808803', '8588850', '2998469', '12550428', '1997475', '2306599', '3364787', '1244847', '1594929', '6841820', '1307053', '1018948', '4379944', '2312830', '2163556', '1060857', '1361872', '1426026', '1535879', '1256544', 
'5226446', '1861234', '5254389', '834626', '8837305', '18409390', '2623197', '8571754', '2608518', '9133869', '5732792', '1489179', '1306988', '1485951', '835809', '1348759', '2189468', '19080025', '13877623', '6522753', '1973806', '6060617', '4372283', '1212637', '807158', '9245783', '8530263', '443584', '1925657', '6402528', '2068887', '1541078', '1590592', '1580314', '2506579', '691591', '12265058', '3456520', '9586871', '768413', '825220', '1959294', '1316883', '1997490', '2120396', '1888935', '1797641', '1674950', '6857309', '1458173', '1703855', '12273942', '1061226', '798598', '1786400', '1599114', '1923793', '895308', '6742147', '2593370', '2284556', '6513972']
# Proxy-pool endpoint.
# NOTE(review): no active code in Version1Spider reads this value — confirm
# whether it is still needed.
proxy_url='http://123.207.35.36:5010/get/'
# JavaScript source that Version1Spider.get_oceanball reads and evaluates as
# `var callback = <file contents>`. Absolute, machine-specific path.
JS_PATH = '/Users/admin/Documents/scrapyceshi/ctripcomments/ctripcomments/spiders/callback.js'
# Comment-list endpoint that Version1Spider.ocean_parse requests with GET.
api="http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx"
# Text file read by Version1Spider.start_requests; split on newlines, one
# hotel id per line. Absolute, machine-specific path.
ids_path='/Users/admin/Documents/scrapyceshi/ctripcomments/ctripcomments/spiders/newids.txt'
class Version1Spider(scrapy.Spider):
    """Spider that stores per-comment metadata for Ctrip hotels.

    Request flow for every hotel id (and every comment page of that hotel):

    1. ``start_requests`` / ``comments_parse`` request the "oceanball" script,
       whose URL carries a freshly generated JS callback name (``cas``).
    2. ``ocean_parse`` runs that script in PyV8 to obtain the ``eleven`` token
       and requests the comment-list endpoint (``api``) with it.
    3. ``comments_parse`` appends one row per comment to ``result5.csv``,
       records progress in Redis and, on page 1, schedules pages 2..N.

    The class relies on module-level names that are defined outside of it:
    ``old``, ``redis_cli``, ``uas``, ``proxy``, ``ids_path``, ``JS_PATH`` and
    ``api``.
    NOTE(review): the assignment of ``proxy`` is not visible next to this
    class — confirm it exists at module level.
    """

    name = 'version1'
    # Incremented once for every comment row written by comments_parse.
    total = 0
    # start_requests stops as soon as the running count of not-yet-crawled
    # ids reaches this value, i.e. at most ``batch_limit - 1`` hotels are
    # scheduled per run.
    batch_limit = 888

    def start_requests(self):
        """Yield one oceanball request per hotel id that still needs crawling.

        Ids are read from ``ids_path`` (one per line). An id is skipped when
        it is blank, is contained in ``old``, or is a member of the Redis set
        ``baseids``.
        """
        with open(ids_path) as f:
            candidate_ids = f.read().strip().split('\n')
        # Build the lookup set once instead of scanning ``old`` for every id.
        already_done = set(old)
        # int() so that a limit passed as a string spider argument also works.
        limit = int(self.batch_limit)
        num = 0
        for hotel_id in candidate_ids:
            hotel_id = hotel_id.strip()
            if not hotel_id:
                # An empty file or a blank line must not become a hotel id.
                continue
            if hotel_id in already_done or redis_cli.sismember('baseids', hotel_id):
                continue
            num += 1
            if num >= limit:
                break
            print(
                '**************************************第{}个id{}开始爬取评论**********************************'.format(
                    num, hotel_id))
            oceanball, cas = self._oceanball_with_retry(notice='等待')
            headers = self.url_start()
            headers['Referer'] = 'http://hotels.ctrip.com/hotel/{}.html'.format(hotel_id)
            # A fresh dict per hotel: ocean_parse fills in the hotel-specific
            # fields, so the dict must not be shared between requests.
            params = {"MasterHotelID": "",
                      "hotel": "",
                      "currentPage": "1",
                      "viewVersion": "c",
                      "eleven": "",
                      }
            yield self._oceanball_request(oceanball, cas, headers, hotel_id, params)

    def _oceanball_with_retry(self, notice=None):
        """Call get_oceanball until it succeeds and return its result.

        Sleeps 2 seconds after each failure (time.sleep blocks the calling
        thread). ``notice``, when given, is printed before each sleep.
        ``except Exception`` is used so that KeyboardInterrupt and
        GeneratorExit are never swallowed.
        """
        while True:
            try:
                return self.get_oceanball()
            except Exception:
                if notice is not None:
                    print(notice)
                time.sleep(2)

    def _oceanball_request(self, oceanball, cas, headers, hotel_id, params):
        """Build the Request for an oceanball URL, handled by ocean_parse."""
        return scrapy.Request(
            url=oceanball,
            headers=headers,
            meta={'cas': cas, 'headers': headers, 'id': hotel_id,
                  'params': params, 'proxy': proxy},
            callback=self.ocean_parse,
            dont_filter=True)

    def get_oceanball(self):
        """Generate a callback name and the matching oceanball URL.

        Evaluates the JS stored at ``JS_PATH`` as ``callback`` and calls
        ``callback(15)`` to produce ``cas``; the JS clock supplies the ``_``
        query parameter.

        Returns:
            tuple: ``(oceanball_url, cas)``.
        """
        oceanball = 'http://hotels.ctrip.com/domestic/cas/oceanball?callback=%s&_=%s'
        # Context manager so the file handle is closed on every call.
        with open(JS_PATH) as f:
            callback_js = f.read()
        with PyV8.JSContext() as ctxt:
            ctxt.eval('var callback = %s' % callback_js)
            ctxt.eval('cas = callback(15)')
            ctxt.eval('var current_time = (new Date).getTime()')
            js_locals = ctxt.locals
            cas = js_locals.cas
            current_time = js_locals.current_time
        oceanball = oceanball % (cas, int(current_time))
        return (oceanball, cas)

    def ocean_parse(self, response):
        """Derive ``eleven`` from the oceanball script and request the comments.

        When the body does not start with ``eval`` the script is treated as
        tampered with and a new oceanball request is issued for the same page.
        When evaluating the script fails, the error and the body are printed
        and nothing is yielded.
        """
        # Copy so that filling in the hotel-specific fields below never leaks
        # into another in-flight request carrying the same dict.
        params = dict(response.meta['params'])
        print('*'*8, params['currentPage'])
        ocean = response.body.decode('utf-8')
        cas = response.meta['cas']
        hotel_id = response.meta['id']
        headers = response.meta['headers']

        if ocean[:4] != 'eval':
            # Unexpected script: fetch a new one and come back to this callback.
            print(ocean[:4])
            oceanball, cas = self.get_oceanball()
            yield self._oceanball_request(oceanball, cas, headers, hotel_id, params)
            print('{}第{}页重试,重试原因:js被下毒了'.format(hotel_id, params['currentPage']))
            print(response.body.decode('utf-8'))
            return

        # The request is built inside the try but yielded outside of it, so the
        # handler only ever sees errors raised by this code.
        try:
            eleven = self._solve_eleven(ocean, cas, hotel_id)
            print('ok'*10)
            params['MasterHotelID'] = hotel_id
            params['hotel'] = hotel_id
            params['eleven'] = eleven
            request = scrapy.FormRequest(
                url=api, method='GET',
                meta={'params': params, 'hotel_id': hotel_id,
                      'headers': headers, 'proxy': proxy},
                formdata=params, headers=headers, dont_filter=True,
                callback=self.comments_parse)
        except Exception as e:
            print(e, response.body.decode('utf-8'))
            return
        yield request

    def _solve_eleven(self, ocean, cas, hotel_id):
        """Run the oceanball script in PyV8 and return the ``eleven`` value.

        ``ocean`` is the raw script text, ``cas`` the callback name it was
        requested with, ``hotel_id`` the hotel whose page URL is faked as
        ``window.location.href``.
        """
        # Turn the outer eval(...) into JSON.stringify(...) so that the
        # unpacked script comes back as a string instead of being executed.
        ocean = ocean.replace('eval', 'JSON.stringify')
        # ``with`` guarantees the context is left again; it used to be entered
        # manually and never exited.
        with PyV8.JSContext() as ctxt:
            ocean = ctxt.eval(ocean)
        # SECURITY: this eval() runs on text derived from a remote response.
        # It is kept to preserve behaviour (it un-quotes the JSON string
        # literal); consider json.loads() instead.
        ocean = eval(ocean)
        ocean = ocean.replace(cas, 'eleven=' + cas)
        with PyV8.JSContext() as ctxt:
            # Minimal fake browser environment for the script to run in.
            ctxt.eval(
                'var hotel_id = "%s"; var site = {}; site.getUserAgent = function(){}; var Image = function(){}; var window = {}; window.document = {body:{innerHTML:"1"}, documentElement:{attributes:{webdriver:"1"}}, createElement:function(x){return {innerHTML:"1"}}}; var document = window.document;window.navigator = {"appCodeName":"Mozilla", "appName":"Netscape", "language":"zh-CN", "platform":"Win"}; window.navigator.userAgent = site.getUserAgent(); var navigator = window.navigator; window.location = {}; window.location.href = "http://hotels.ctrip.com/hotel/"+hotel_id+".html"; var location = window.location;' % hotel_id)
            ctxt.eval('var navigator = {userAgent:{indexOf: function(x){return "1"}}, geolocation:"1"}')
            # Define the callback so that invoking it runs the wrapped function.
            ctxt.eval('var %s = function(x){return x()}' % cas)
            ctxt.eval(ocean)
            eleven = ctxt.locals.eleven
        return eleven

    @staticmethod
    def _first_text(selector, query):
        """Return the first string matched by ``query`` on ``selector``, or ''."""
        found = selector.xpath(query).extract()
        return found[0] if found else ''

    def comments_parse(self, response):
        """Store the comments of one page and schedule the remaining pages.

        Each comment block becomes the row
        ``[hotel_id, name, room_type, date, guest_type]`` in ``result5.csv``
        and in the Redis set ``ctrip_comments``. On page 1 the total comment
        count is used to request pages 2..N (15 comments per page). An empty
        response body is treated as blocked and the page is requested again.
        """
        params = response.meta['params']
        print('*'*18, params['currentPage'])
        hotel_id = response.meta['hotel_id']
        headers = response.meta['headers']
        body = response.body.decode('utf-8')

        if '此酒店暂无点评' in body:
            # The hotel has no reviews: mark it as done and stop.
            print('此酒店暂无点评')
            redis_cli.sadd('baseids', hotel_id)
            return

        comment_blocks = response.xpath('//div[@class="comment_block J_asyncCmt"]')
        with open('result5.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for block in comment_blocks:
                # Fields can be missing on later pages (room type, guest
                # type); a missing field becomes ''. Every query starts with
                # './/' so it only matches inside this comment block — a
                # leading '//' would search the whole page.
                info = [
                    hotel_id,
                    self._first_text(block, './/p[@class="name"]/span/text()'),
                    self._first_text(block, './/a[@class="room J_baseroom_link"]/text()'),
                    self._first_text(block, './/span[@class="date"]/text()'),
                    self._first_text(block, './/span[@class="type"]/text()'),
                ]
                writer.writerow(info)
                # NOTE(review): a list is passed as a set member here; verify
                # the installed redis client accepts non-string values.
                redis_cli.sadd('ctrip_comments', info)
                print(info)
                redis_cli.sadd('baseids', hotel_id)
                self.total += 1
                print(self.total)

        try:
            # Total number of comments, taken from the "全部(N)" label.
            label = response.xpath('//span[@id="All_Comment"]/text()').extract()[0]
            comments = int(re.findall(r'全部\((\d+)\)', label)[0])
            print('{}评论总量{}'.format(hotel_id, comments))
        except Exception:
            # Anti-crawling handling: an empty body means the request was
            # blocked, so the same page is requested again. Only the oceanball
            # generation is retried in a loop; the yield is outside any
            # try/except so closing the generator is never intercepted.
            if not body:
                oceanball, cas = self._oceanball_with_retry()
                yield self._oceanball_request(oceanball, cas, headers, hotel_id, params)
                print('{}第{}页重试'.format(hotel_id, params['currentPage']))
            return

        if int(params['currentPage']) != 1:
            # Only page 1 schedules further pages.
            print('{}正在循环翻页'.format(hotel_id))
            return

        if comments > 15:
            pages = math.ceil(comments / 15)
            for page in range(2, pages + 1):
                # Each page gets its own copy of the parameters.
                page_params = dict(params)
                print('{}第{}页开始爬'.format(hotel_id, page))
                oceanball, cas = self.get_oceanball()
                page_params['currentPage'] = str(page)
                print(page_params)
                yield self._oceanball_request(oceanball, cas, headers, hotel_id, page_params)
            print('{}第{}页爬完'.format(hotel_id, pages))

    def url_start(self):
        """Build request headers: a random User-Agent plus proxy authorization.

        The ``sign`` value is the upper-cased MD5 hex digest of the
        ``orderno``/``secret``/``timestamp`` string built below.
        NOTE(review): the ``orderno`` and ``secret`` literals look like
        redacted placeholders and differ between the two strings — replace
        them with real credentials.

        Returns:
            dict: headers with ``User-Agent`` and ``Proxy-Authorization``.
        """
        times = int(time.time())
        planText = "orderno=ZxxxxxxxxQO,secret=fbba4xxxxbd,timestamp={}".format(times)
        content = hashlib.md5(planText.encode('utf-8')).hexdigest()
        headers = {'User-Agent': random.choice(uas)}
        # '&timestamp=' had been corrupted to '×tamp=' ('&times' rendered as an
        # HTML entity), which produced a malformed header.
        headers['Proxy-Authorization'] = 'sign={}&orderno=ZFxxxxxxxO&timestamp={}'.format(
            content.upper(), times)
        return headers