网站爬虫开发以及SQL注入检测插件部分代码
2017-05-31 本文已影响143人
c4a1d989518e
import requests,re,random
# Boolean-based blind injection payloads. Each is %-formatted with a random
# integer pair: an always-true condition (n=n) should render the same page as
# the original URL, while an always-false one (n=n+1) should not.
BOOLEAN_TESTS = (" AND %d=%d", " OR NOT (%d=%d)")
# Error-based fingerprints: if the probed page contains one of these patterns,
# the backend database (and the injection point) is revealed by its error text.
DBMS_ERRORS = { # regular expressions used for DBMS recognition based on error message response
"MySQL": (r"SQL syntax.*MySQL", r"Warning.*mysql_.*", r"valid MySQL result", r"MySqlClient\."),
"PostgreSQL": (r"PostgreSQL.*ERROR", r"Warning.*\Wpg_.*", r"valid PostgreSQL result", r"Npgsql\."),
"Microsoft SQL Server": (r"Driver.* SQL[\-\_\ ]*Server", r"OLE DB.* SQL Server", r"(\W|\A)SQL Server.*Driver", r"Warning.*mssql_.*", r"(\W|\A)SQL Server.*[0-9a-fA-F]{8}", r"(?s)Exception.*\WSystem\.Data\.SqlClient\.", r"(?s)Exception.*\WRoadhouse\.Cms\."),
"Microsoft Access": (r"Microsoft Access Driver", r"JET Database Engine", r"Access Database Engine"),
"Oracle": (r"\bORA-[0-9][0-9][0-9][0-9]", r"Oracle error", r"Oracle.*Driver", r"Warning.*\Woci_.*", r"Warning.*\Wora_.*"),
"IBM DB2": (r"CLI Driver.*DB2", r"DB2 SQL error", r"\bdb2_\w+\("),
"SQLite": (r"SQLite/JDBCDriver", r"SQLite.Exception", r"System.Data.SQLite.SQLiteException", r"Warning.*sqlite_.*", r"Warning.*SQLite3::", r"\[SQLITE_ERROR\]"),
"Sybase": (r"(?i)Warning.*sybase.*", r"Sybase message", r"Sybase.*Server message.*"),
}
def sqlcheck(url):
    """Probe *url* (which must carry a "?" query string) for SQL injection.

    Returns:
        False when the URL has no query string or no injection is detected;
        True when a DBMS error fingerprint appears in the error-probe page;
        a "sql found: <url>" message string when boolean-based blind
        injection is detected.
    """
    # BUG FIX: the original used `not url.find("?")`, which is True only when
    # "?" is the FIRST character (find returns 0) and False when "?" is
    # absent (find returns -1, which is truthy) — the inverse of the intent.
    if "?" not in url:
        return False
    # Append )("' (URL-encoded) to try to provoke a database error page.
    _url = url + "%29%28%22%27"
    # BUG FIX: the original called `request.get` (undefined name); the module
    # imported is `requests`.
    _content = requests.get(_url).text
    # Error-based detection: match the response against known DBMS patterns.
    for dbms, regex in ((d, r) for d in DBMS_ERRORS for r in DBMS_ERRORS[d]):
        if re.search(regex, _content):
            return True
    # Boolean-based blind detection: the unmodified page must equal the
    # always-true page and differ from the always-false page.
    content = {}
    # BUG FIX: the baseline must be the ORIGINAL url, not the error-probe
    # _url the original code fetched here.
    content["origin"] = requests.get(url).text
    for test_payload in BOOLEAN_TESTS:
        RANDINT = random.randint(1, 255)
        # Always-true condition, e.g. " AND 42=42".
        _url = url + test_payload % (RANDINT, RANDINT)
        # BUG FIX: `downloader.request` was undefined; fetch with requests.
        content["true"] = requests.get(_url).text
        # Always-false condition, e.g. " AND 42=43".
        _url = url + test_payload % (RANDINT, RANDINT + 1)
        content["false"] = requests.get(_url).text
        if content["origin"] == content["true"] != content["false"]:
            # BUG FIX: original was `"sql fonud: %"%url` — a typo plus an
            # incomplete %-format that raises ValueError at runtime.
            return "sql found: %s" % url
    # BUG FIX: explicitly report "no injection" instead of falling off the
    # end and returning None.
    return False
- 代码中的
AND %d=%d", " OR NOT (%d=%d)
通过在url 后面加上 AND %d=%d 或者 OR NOT (%d=%d)（其中 %d 会被替换成随机整数），用来判断是否存在SQL注入。 -
DBMS_ERRORS
中的内容,如果查询时,页面会报错,通过页面报错的内容,抓取其中的语句,就可以判断后台是用的什么数据库。这些常用的报错语句收集起来,就是DBMS_ERRORS
了,DBMS是Database Management System（数据库管理系统）的简称。 -
url.find("?")
是用来判断网页的url中是否有?
来判读的,?是什么意思呢?涉及到一个表格:
URL符号 | 含义 |
---|---|
? | 分隔实际的URL和参数 |
/ | 分隔目录和子目录 |
& | URL中指定的参数间的分隔符 |
= | URL中指定的参数的值 |
+ | 表示空格(在URL中不能使用空格) |
# | 表示书签 |
- 用正则匹配页面返回值与
DBMS_ERRORS
中的语句
for (dbms, regex) in ((dbms, regex) for dbms in DBMS_ERRORS for regex in DBMS_ERRORS[dbms]):
if(re.search(regex,_content)):
return True
regex是正则表达式。
-
RANDINT
这里有什么用如果不懂得话,可以看下图。
运行结果
-
content["origin"]
是原始网页,意思就是当原始的网页等于正确的网页不等于错误的网页内容时就可以判定这个地址存在注入漏洞。
content["origin"]==content["true"]!=content["false"]
接下来就是抓取URL网页了
收集网站的链接,爬虫需要记录一下已经爬取的链接和待爬取的链接,并且去重复。
class UrlManager(object):
    """Crawl-frontier bookkeeping: pending links vs. links already fetched.

    Two sets provide de-duplication for free: ``new_urls`` holds links
    waiting to be crawled, ``old_urls`` holds links already handed out.
    """

    def __init__(self):
        self.new_urls = set()  # pending, not yet crawled
        self.old_urls = set()  # already returned by get_new_url()

    def add_new_url(self, url):
        """Queue one URL, ignoring None and anything seen before."""
        if url is None:
            return
        seen = url in self.new_urls or url in self.old_urls
        if not seen:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; tolerates None and empty collections."""
        if urls is None or len(urls) == 0:
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """Return True while at least one pending URL remains."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, mark it as seen, and return it."""
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked
- 去重复,用Python的set()函数,set()函数用法:
>>> a = [11,22,33,44,11,22]
>>> b = set(a)
>>> b
set([33, 11, 44, 22])
- 代码中有一个需要学习的地方就是代码结尾处的:
new_url = self.new_urls.pop()
这个pop方法。
下载网页内容
import requests
class Downloader(object):
    """Thin wrapper around ``requests`` used by the crawler to fetch pages."""

    def get(self, url):
        """GET *url* (10 s timeout); return body text, or None on non-200."""
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        return response.text

    def post(self, url, data):
        """POST *data* to *url*; return body text (status is not checked)."""
        response = requests.post(url, data)
        return response.text

    def download(self, url, htmls):
        """Fetch *url* and append ``{"url": ..., "html": ...}`` to *htmls*.

        Best-effort: returns None (appending nothing) for a None url, a
        non-200 status, or any network/timeout failure.
        """
        if url is None:
            return None
        record = {"url": url}
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return None
            record["html"] = response.text
        except Exception:
            # Deliberate best-effort swallow: a failed fetch just skips the URL.
            return None
        htmls.append(record)
-
requests.get(url,timeout=10)
和r.status_code != 200
还有_str = r.text
的用法 -
requests.post(url,data)
的用法 - 还有就是
_str = {}
中字典的用法。