爬虫(整页)
2017-09-22 本文已影响2人
晨曦_hero
/**
 * Entry point of the crawler.
 *
 * <p>Starts three threads that parse listing pages for image links and five
 * threads that download the queued images. All of them share one
 * {@link ImageDownload} instance. The download threads loop forever, so the
 * program keeps running until it is stopped externally.
 */
public class Text02 {
    /** Number of download threads started by {@link #main(String[])}. */
    private static final int DOWNLOADER_COUNT = 5;

    public static void main(String[] args) {
        // Single instance shared by every parser and downloader thread.
        final ImageDownload imgDownload = new ImageDownload();

        // Producers: each thread parses its own half-open range of listing pages.
        startParser(imgDownload, 1, 500);
        startParser(imgDownload, 500, 1000);
        startParser(imgDownload, 1000, 1500);

        // Consumers: download whatever the parsers have queued.
        for (int i = 0; i < DOWNLOADER_COUNT; i++) {
            startDownloader(imgDownload);
        }
    }

    /**
     * Starts a thread that parses pages {@code fromPage} (inclusive) to
     * {@code toPage} (exclusive) and then exits.
     */
    private static void startParser(final ImageDownload imgDownload, final int fromPage, final int toPage) {
        new Thread(new Runnable() {
            @Override
            public void run() {
                for (int page = fromPage; page < toPage; page++) {
                    try {
                        imgDownload.parseImageUrl(page);
                    } catch (Exception e) {
                        // A failing page is reported and skipped so the rest of the range is still parsed.
                        e.printStackTrace();
                    }
                }
            }
        }).start();
    }

    /** Starts a thread that downloads queued images in an endless loop. */
    private static void startDownloader(final ImageDownload imgDownload) {
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (true) { // keep downloading for the lifetime of the process
                    try {
                        imgDownload.downloadimage();
                    } catch (Exception e) {
                        // A failed download is reported; the loop moves on to the next image.
                        e.printStackTrace();
                    }
                }
            }
        }).start();
    }
}
/**
 * Work queue shared between page-parsing (producer) threads and
 * image-downloading (consumer) threads.
 *
 * <p>Producers call {@link #parseImageUrl(int)} to append image links;
 * consumers call {@link #downloadimage()} to take one link and save it under
 * {@code Image/}. The queue is guarded by this object's monitor. The monitor
 * is held only while the queue itself is touched, never during network or
 * file I/O, so several threads can fetch and download at the same time.
 */
class ImageDownload {
    /** Producers pause while the queue holds more than this many links. */
    private static final int MAX_PENDING = 100;
    /** Output directory prefix, relative to the working directory. */
    private static final String OUTPUT_DIR = "Image/";

    // Pending image URLs; every access happens inside synchronized (this).
    LinkedList<String> list = new LinkedList<String>();

    /**
     * Fetches one listing page and queues the image links found on it.
     *
     * <p>Blocks while the queue holds more than {@code MAX_PENDING} links.
     *
     * @param page page number appended to the listing URL
     * @throws Exception if the wait is interrupted or the page cannot be fetched
     */
    void parseImageUrl(int page) throws Exception {
        synchronized (this) {
            // Loop, not "if": wait() may return spuriously, or after another
            // producer has already refilled the queue.
            while (list.size() > MAX_PENDING) {
                wait();
            }
        }

        String htmlUrl = "https://www.doutula.com/article/list/?page=" + page;
        Document doc = Jsoup.connect(htmlUrl).get();
        // Compound CSS selector: any element carrying one of these classes.
        Elements els = doc.select(".lazy,.image_dtb,.img--responsive");

        LinkedList<String> found = new LinkedList<String>();
        for (Element el : els) {
            String imageUrl = el.attr("data-original");
            // Keep only absolute http(s) links, which new URL(...) can open.
            // The previous check compared against the listing-page URL, which an
            // image link does not contain, so every link was being skipped.
            if (!imageUrl.startsWith("http")) {
                continue;
            }
            found.add(imageUrl);
        }

        synchronized (this) {
            list.addAll(found);
            // Wake every waiter: consumers and producers share this monitor.
            notifyAll();
        }
    }

    /**
     * Takes one link from the queue and downloads it into {@code Image/},
     * named after the last path segment of the URL.
     *
     * <p>Blocks while the queue is empty.
     *
     * @throws Exception if the wait is interrupted or the download or file write fails
     */
    void downloadimage() throws Exception {
        String imageUrl;
        synchronized (this) {
            while (list.isEmpty()) {
                wait();
            }
            // Take and remove in one step so no other consumer sees the same link.
            imageUrl = list.removeFirst();
            // Mirrors the producers' wait condition (size > MAX_PENDING). notifyAll
            // rather than notify, since a single notify could wake another
            // consumer instead of a blocked producer.
            if (list.size() <= MAX_PENDING) {
                notifyAll();
            }
        }

        URL url = new URL(imageUrl);
        URLConnection connection = url.openConnection();
        connection.connect();

        String[] imageUrlArr = imageUrl.split("/");
        String fileName = OUTPUT_DIR + imageUrlArr[imageUrlArr.length - 1];
        // FileOutputStream does not create missing parent directories.
        new java.io.File(OUTPUT_DIR).mkdirs();

        InputStream in = connection.getInputStream();
        try {
            FileOutputStream fileout = new FileOutputStream(fileName);
            try {
                byte[] by = new byte[1024];
                int length;
                // read() may fill only part of the buffer; write exactly what was read.
                while ((length = in.read(by)) != -1) {
                    fileout.write(by, 0, length);
                }
            } finally {
                fileout.close();
            }
        } finally {
            in.close();
        }
    }
}