使用 jsoup 提取链接并下载网站图片
所使用的包为 jsoup-1.6.1.jar(原文同时列出了 commons-httpclient.jar,但下面的代码实际使用的是 JDK 自带的 java.net.HttpURLConnection);
利用 jsoup 提取页面中图片的 src 路径;
利用 HttpURLConnection 下载网站图片。
01 |
public class DownImages { |
|
02 |
private static int COUNT = 0; |
03 |
private static int DOWN_COUNT = 0; |
|
04 |
|
05 |
public static void jsoupHTML(String urlPath) throws Exception{ |
06 |
Document doc = Jsoup.connect(urlPath).timeout(1000000).get(); |
07 |
//:当前页中的图片 |
|
08 |
Elements srcLinks = doc.select("img[src$=.jpg]"); |
09 |
for (Element link : srcLinks) { |
|
10 |
//:剔除标签,只剩链接路径 |
11 |
String imagesPath = link.attr("src"); |
|
12 |
System.out.println("当前访问路径:"+imagesPath); |
13 |
getImages(imagesPath, "d://images//0000"+ ++COUNT +".jpg"); |
|
14 |
} |
15 |
|
|
16 |
//:提取网站中所有的href连接 |
17 |
Elements linehrefs = doc.select("a[href]"); |
|
18 |
|
19 |
for (Element linehref : linehrefs) { |
|
20 |
String lihr = linehref.attr("href"); |
21 |
if(lihr.length()>4){ |
|
22 |
String ht = lihr.substring(0, 4); |
23 |
String htt = lihr.substring(0, 1); |
24 |
if(!ht.equals("http") && htt.equals("/")){ |
25 |
lihr = urlPath + lihr; |
|
26 |
} |
27 |
if(lihr.substring(0, 4).equals("http")){ |
|
28 |
Document docs = Jsoup.connect(lihr).timeout(1000000).get(); |
29 |
Elements links = docs.select("img[src$=.jpg]"); |
30 |
for (Element link : links) { |
31 |
//:剔除标签,只剩链接路径 |
32 |
String imagesPath = link.attr("src"); |
33 |
System.out.println("当前访问路径:"+imagesPath); |
|
34 |
getImages(imagesPath, "d://images//0000"+ COUNT++ +".jpg"); |
35 |
} |
|
36 |
} |
37 |
} |
|
38 |
} |
39 |
} |
|
40 |
|
41 |
|
|
42 |
/** |
43 |
* @param urlPath 图片路径 |
|
44 |
* @throws Exception |
45 |
*/ |
|
46 |
public static void getImages(String urlPath,String fileName) throws Exception{ |
47 |
URL url = new URL(urlPath);//:获取的路径 |
|
48 |
//:http协议连接对象 |
49 |
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
|
50 |
conn.setRequestMethod("GET"); |
51 |
conn.setReadTimeout(6 * 10000); |
|
52 |
if (conn.getResponseCode() <10000){ |
53 |
InputStream inputStream = conn.getInputStream(); |
|
54 |
byte[] data = readStream(inputStream); |
55 |
if(data.length>(1024*10)){ |
|
56 |
FileOutputStream outputStream = new FileOutputStream(fileName); |
57 |
outputStream.write(data); |
|
58 |
System.err.println("第"+ ++DOWN_COUNT +"图片下载成功"); |
59 |
outputStream.close(); |
|
60 |
} |
61 |
} |
|
62 |
|
63 |
} |
|
64 |
|
65 |
/** |
|
66 |
* 读取url中数据,并以字节的形式返回 |
67 |
* @param inputStream |
|
68 |
* @return |
69 |
* @throws Exception |
|
70 |
*/ |
71 |
public static byte[] readStream(InputStream inputStream) throws Exception{ |
72 |
ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); |
73 |
byte[] buffer = new byte[1024]; |
|
74 |
int len = -1; |
75 |
while((len = inputStream.read(buffer)) !=-1){ |
|
76 |
outputStream.write(buffer, 0, len); |
77 |
} |
|
78 |
outputStream.close(); |
79 |
inputStream.close(); |
|
80 |
return outputStream.toByteArray(); |
81 |
} |
|
82 |
|
83 |
public static void main(String[] args) { |
|
84 |
try { |
85 |
String urlPath = "http://www.22mm.cc/"; |
|
86 |
jsoupHTML(urlPath); |
87 |
} catch (Exception e) { |
|
88 |
e.printStackTrace(); |
89 |
}finally{ |
|
90 |
System.out.println("共访问"+COUNT+"张图片,其中下载"+DOWN_COUNT+"张图片"); |
91 |
} |
|
92 |
} |
93 |
} |
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。