Java爬蟲下載千張美女圖片!

來源:https://blog.csdn.net/qq_35402412
目的
準備工作
分析
思路
設置URL請求參數
訪問URL請求,獲取圖片地址
圖片地址存入List
遍歷List,使用線程池下載到本地
代碼
SougouImgProcessor.java 爬取圖片類
import com.alibaba.fastjson.JSONObject;import us.codecraft.webmagic.utils.HttpClientUtils;import victor.chang.crawler.pipeline.SougouImgPipeline;import java.util.ArrayList;import java.util.List;/*** A simple PageProcessor.** @author code4crafter@gmail.com* @since 0.1.0*/public class SougouImgProcessor {private String url;private SougouImgPipeline pipeline;private ListdataList; private ListurlList; private String word;public SougouImgProcessor(String url,String word) {this.url = url;this.word = word;this.pipeline = new SougouImgPipeline();this.dataList = new ArrayList<>();this.urlList = new ArrayList<>();}public void process(int idx, int size) {String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word));JSONObject object = JSONObject.parseObject(res);Listitems = (List )((JSONObject)object.get("data")).get("items"); for(JSONObject item : items){this.urlList.add(item.getString("picUrl"));}this.dataList.addAll(items);}// 下載public void pipelineData(){// pipeline.process(this.urlList, word); // 單線程pipeline.processSync(this.urlList, this.word); // 多線程}public static void main(String[] args) {String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";SougouImgProcessor processor = new SougouImgProcessor(url,"美女");int start = 0, size = 50, limit = 1000; // 定義爬取開始索引、每次爬取數(shù)量、總共爬取數(shù)量for(int i=start;iprocessor.process(i, size);processor.pipelineData();}}
SougouImgPipeline.java 圖片下載類
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Downloads a list of image URLs into {@code <path>/<category>/}.
 *
 * <p>Success and failure counters are shared by all calls on one instance and
 * are safe to update from the worker threads this class starts.
 *
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Extension used when none can be derived from the image URL. */
    private String extension = ".jpg";
    /** Root directory under which one sub-directory per category is created. */
    private String path;
    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        setPath("E:/pipeline/sougou");
    }

    public SougouImgPipeline(String path) {
        setPath(path);
    }

    public SougouImgPipeline(String path, String extension) {
        setPath(path);
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Picks the file extension (including the leading dot) for a URL.
     * Only the last path segment is inspected, with any query string or
     * fragment removed; if it has no short alphanumeric suffix the configured
     * default extension is returned.
     */
    private String resolveExtension(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String pathPart = url.substring(0, end);
        int dot = pathPart.lastIndexOf('.');
        int slash = pathPart.lastIndexOf('/');
        boolean inLastSegment = dot > slash && dot < pathPart.length() - 1;
        if (inLastSegment && pathPart.length() - dot <= 6) {
            String candidate = pathPart.substring(dot);
            boolean alphanumeric = true;
            for (int i = 1; i < candidate.length(); i++) {
                if (!Character.isLetterOrDigit(candidate.charAt(i))) {
                    alphanumeric = false;
                    break;
                }
            }
            if (alphanumeric) {
                return candidate;
            }
        }
        return extension;
    }

    /**
     * Downloads one image unless the target file already exists.
     *
     * @param url  image URL
     * @param cate category; used as the sub-directory name
     * @param name file name without extension
     * @throws Exception if the URL is malformed or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String dirPath = this.path + "/" + cate + "/";
        File dir = new File(dirPath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        String fileName = (name + resolveExtension(url)).replace("-", "");
        File img = new File(dirPath + fileName);
        if (img.exists()) {
            // Downloaded on an earlier run; skip it.
            System.out.println(String.format("文件%s已存在本地目錄", fileName));
            return;
        }
        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        boolean complete = false;
        try (InputStream in = con.getInputStream();
             FileOutputStream out = new FileOutputStream(img)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            complete = true;
        } finally {
            if (!complete) {
                // Remove a partial file so a later run does not mistake it for a finished download.
                img.delete();
            }
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下載第%s張圖片", suc.incrementAndGet()));
    }

    /**
     * Downloads all URLs sequentially on the calling thread. File names are
     * derived from a hash of each URL so they are valid on disk and stable
     * across runs. {@code null} entries are skipped.
     *
     * @param data image URLs
     * @param word category (sub-directory name)
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg(picUrl, word, String.format("%08x", picUrl.hashCode()));
            } catch (Exception e) {
                // Best effort: a failed image is only counted.
                fails.incrementAndGet();
            }
        }
        System.out.println("下載成功: " + suc.get());
        System.out.println("下載失敗: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗時:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads all URLs concurrently, one task per URL, and waits up to 60
     * seconds for them to finish. Files are named after the zero-padded list
     * index. {@code null} entries are skipped.
     *
     * @param data image URLs
     * @param word category (sub-directory name)
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // At least four digits; larger indexes simply grow wider instead of colliding.
            String finalName = String.format("%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    // Best effort: a failed image is only counted.
                    fails.incrementAndGet();
                }
            });
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Timed out; running downloads are deliberately left to finish on their own.
                // executorService.shutdownNow();
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下載成功: " + suc);
            System.out.println("下載失敗: " + fails);

            File dir = new File(this.path + "/" + word + "/");
            // list() returns null when the directory was never created (nothing was downloaded).
            String[] files = dir.list();
            int len = files == null ? 0 : files.length;
            System.out.println("當前共有文件:" + len);

            long end = System.currentTimeMillis();
            System.out.println("耗時:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Splits the list into {@code threadNum} contiguous segments and runs
     * {@link #process(List, String)} on each segment in its own task. The
     * tasks are submitted and the method returns without waiting for them.
     * Falls back to a single sequential pass when there are fewer URLs than
     * threads or {@code threadNum} is not positive.
     *
     * @param data      image URLs
     * @param word      category (sub-directory name)
     * @param threadNum number of segments
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}
HttpClientUtils.java http請求工具類
import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.NameValuePair;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.methods.HttpUriRequest;import org.apache.http.conn.ssl.SSLConnectionSocketFactory;import org.apache.http.conn.ssl.TrustStrategy;import org.apache.http.entity.StringEntity;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.message.BasicNameValuePair;import org.apache.http.ssl.SSLContextBuilder;import org.apache.http.util.EntityUtils;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import javax.net.ssl.HostnameVerifier;import javax.net.ssl.SSLContext;import javax.net.ssl.SSLSession;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.security.GeneralSecurityException;import java.security.cert.CertificateException;import java.security.cert.X509Certificate;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;/*** @author [email protected]* Date: 17/3/27*/public abstract class HttpClientUtils {public static Map<String, List<String>> convertHeaders(Header[] headers) {Map<String, List<String>> results = new HashMap<String, List<String>>();for (Header header : headers) {List<String> list = results.get(header.getName());if (list == null) {list = new ArrayList<String>();results.put(header.getName(), list);}list.add(header.getValue());}return results;}/*** http的get請求** @param url*/public static String get(String url) {return get(url, "UTF-8");}public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);/*** http的get請求** @param url*/public static String get(String url, String charset) {HttpGet httpGet = new HttpGet(url);return executeRequest(httpGet, charset);}/*** 
http的get請求,增加異步請求頭參數(shù)** @param url*/public static String ajaxGet(String url) {return ajaxGet(url, "UTF-8");}/*** http的get請求,增加異步請求頭參數(shù)** @param url*/public static String ajaxGet(String url, String charset) {HttpGet httpGet = new HttpGet(url);httpGet.setHeader("X-Requested-With", "XMLHttpRequest");return executeRequest(httpGet, charset);}/*** @param url* @return*/public static String ajaxGet(CloseableHttpClient httpclient, String url) {HttpGet httpGet = new HttpGet(url);httpGet.setHeader("X-Requested-With", "XMLHttpRequest");return executeRequest(httpclient, httpGet, "UTF-8");}/*** http的post請求,傳遞map格式參數(shù)*/public static String post(String url, Map<String, String> dataMap) {return post(url, dataMap, "UTF-8");}/*** http的post請求,傳遞map格式參數(shù)*/public static String post(String url, Map<String, String> dataMap, String charset) {HttpPost httpPost = new HttpPost(url);try {if (dataMap != null) {Listnvps = new ArrayList (); for (Map.Entry<String, String> entry : dataMap.entrySet()) {nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));}UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);formEntity.setContentEncoding(charset);httpPost.setEntity(formEntity);}} catch (UnsupportedEncodingException e) {e.printStackTrace();}return executeRequest(httpPost, charset);}/*** http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)*/public static String ajaxPost(String url, Map<String, String> dataMap) {return ajaxPost(url, dataMap, "UTF-8");}/*** http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)*/public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {HttpPost httpPost = new HttpPost(url);httpPost.setHeader("X-Requested-With", "XMLHttpRequest");try {if (dataMap != null) {Listnvps = new ArrayList (); for (Map.Entry<String, String> entry : dataMap.entrySet()) {nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));}UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, 
charset);formEntity.setContentEncoding(charset);httpPost.setEntity(formEntity);}} catch (UnsupportedEncodingException e) {e.printStackTrace();}return executeRequest(httpPost, charset);}/*** http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)*/public static String ajaxPostJson(String url, String jsonString) {return ajaxPostJson(url, jsonString, "UTF-8");}/*** http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)*/public static String ajaxPostJson(String url, String jsonString, String charset) {HttpPost httpPost = new HttpPost(url);httpPost.setHeader("X-Requested-With", "XMLHttpRequest");// try {StringEntity stringEntity = new StringEntity(jsonString, charset);// 解決中文亂碼問題stringEntity.setContentEncoding(charset);stringEntity.setContentType("application/json");httpPost.setEntity(stringEntity);// } catch (UnsupportedEncodingException e) {// e.printStackTrace();// }return executeRequest(httpPost, charset);}/*** 執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)*/public static String executeRequest(HttpUriRequest httpRequest) {return executeRequest(httpRequest, "UTF-8");}/*** 執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)*/public static String executeRequest(HttpUriRequest httpRequest, String charset) {CloseableHttpClient httpclient;if ("https".equals(httpRequest.getURI().getScheme())) {httpclient = createSSLInsecureClient();} else {httpclient = HttpClients.createDefault();}String result = "";try {try {CloseableHttpResponse response = httpclient.execute(httpRequest);HttpEntity entity = null;try {entity = response.getEntity();result = EntityUtils.toString(entity, charset);} finally {EntityUtils.consume(entity);response.close();}} finally {httpclient.close();}} catch (IOException ex) {ex.printStackTrace();}return result;}public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {String result = "";try {try {CloseableHttpResponse response = httpclient.execute(httpRequest);HttpEntity entity = null;try {entity = response.getEntity();result = 
EntityUtils.toString(entity, charset);} finally {EntityUtils.consume(entity);response.close();}} finally {httpclient.close();}} catch (IOException ex) {ex.printStackTrace();}return result;}/*** 創(chuàng)建 SSL連接*/public static CloseableHttpClient createSSLInsecureClient() {try {SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {return true;}}).build();SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {public boolean verify(String hostname, SSLSession session) {return true;}});return HttpClients.custom().setSSLSocketFactory(sslsf).build();} catch (GeneralSecurityException ex) {throw new RuntimeException(ex);}}}
運行
PS:如果覺得我的分享不錯,歡迎大家隨手點贊、在看。 END
評論
圖片
表情






