1. <strong id="7actg"></strong>
    2. <table id="7actg"></table>

    3. <address id="7actg"></address>
      <address id="7actg"></address>
      1. <object id="7actg"><tt id="7actg"></tt></object>

        Java爬蟲下載千張美女圖片!

        共 2269字,需瀏覽 5分鐘

         ·

        2021-02-10 16:44

        來源:https://blog.csdn.net/qq_35402412



        目的

        爬取搜狗圖片上千張美女圖片并下載到本地

        準備工作

        爬取地址:https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3

        分析

        打開上面的地址,按F12打開開發者工具 - Network - XHR,頁面往下滑動後,XHR欄出現請求信息如下:

        Request URL:

        https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=%E7%BE%8E%E5%A5%B3

        分析這段請求URL的幾個主要參數:

        start=48  表示從第48張圖片開始檢索

        xml_len=48  表示從第48張往後獲取48張圖片

        query=  搜索關鍵詞(例:美女,這裡瀏覽器自動做了轉碼,不影響我們使用)

        點擊Response,找個JSON格式化工具輔助查看。

        JSON格式化工具:https://www.bejson.com/

        分析Response返回的信息,可以發現我們想要的圖片地址放在 picUrl 裡。

        思路

        通過以上分析,不難實現下載方法,思路如下:

        1. 設置URL請求參數

        2. 訪問URL請求,獲取圖片地址

        3. 圖片地址存入List

        4. 遍歷List,使用線程池下載到本地

        代碼

        SougouImgProcessor.java  爬取圖片類

        import com.alibaba.fastjson.JSONObject;import us.codecraft.webmagic.utils.HttpClientUtils;import victor.chang.crawler.pipeline.SougouImgPipeline; import java.util.ArrayList;import java.util.List; /** * A simple PageProcessor. * * @author code4crafter@gmail.com 
        * @since 0.1.0 */public class SougouImgProcessor { private String url; private SougouImgPipeline pipeline; private List dataList; private List urlList; private String word; public SougouImgProcessor(String url,String word) { this.url = url; this.word = word; this.pipeline = new SougouImgPipeline(); this.dataList = new ArrayList<>(); this.urlList = new ArrayList<>(); } public void process(int idx, int size) { String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word)); JSONObject object = JSONObject.parseObject(res); List items = (List)((JSONObject)object.get("data")).get("items"); for(JSONObject item : items){ this.urlList.add(item.getString("picUrl")); } this.dataList.addAll(items); } // 下載 public void pipelineData(){// pipeline.process(this.urlList, word); // 單線程 pipeline.processSync(this.urlList, this.word); // 多線程 } public static void main(String[] args) { String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s"; SougouImgProcessor processor = new SougouImgProcessor(url,"美女"); int start = 0, size = 50, limit = 1000; // 定義爬取開始索引、每次爬取數(shù)量、總共爬取數(shù)量 for(int i=start;i processor.process(i, size); processor.pipelineData(); } }

        SougouImgPipeline.java  圖片下載類

        import java.io.File;
        import java.io.FileOutputStream;
        import java.io.IOException;
        import java.io.InputStream;
        import java.net.URL;
        import java.net.URLConnection;
        import java.nio.charset.StandardCharsets;
        import java.util.List;
        import java.util.Objects;
        import java.util.UUID;
        import java.util.concurrent.ExecutorService;
        import java.util.concurrent.Executors;
        import java.util.concurrent.TimeUnit;
        import java.util.concurrent.atomic.AtomicInteger;
        import java.util.regex.Pattern;

        /**
         * Stores results in files: downloads picture URLs into {@code <path>/<category>/}.
         *
         * <p>Three entry points are offered: {@link #process} (sequential),
         * {@link #processSync} (one task per URL on a cached thread pool) and
         * {@link #processSync2} (the list split into a fixed number of segments).
         * Success and failure counts are accumulated per pipeline instance.
         *
         * @author [email protected]
         * @since 0.1.0
         */
        public class SougouImgPipeline {

            /** Shape of a file extension that is safe to append to a file name, e.g. {@code .jpg}. */
            private static final Pattern SIMPLE_EXTENSION = Pattern.compile("\\.[A-Za-z0-9]{1,5}");

            /** Fallback extension used when the URL does not end in a usable one. */
            private String extension = ".jpg";
            private String path;
            private final AtomicInteger suc = new AtomicInteger();
            private final AtomicInteger fails = new AtomicInteger();

            public SougouImgPipeline() {
                setPath("E:/pipeline/sougou");
            }

            public SougouImgPipeline(String path) {
                setPath(path);
            }

            public SougouImgPipeline(String path, String extension) {
                setPath(path);
                this.extension = extension;
            }

            public void setPath(String path) {
                this.path = path;
            }

            /**
             * Downloads one picture to {@code <path>/<cate>/<name><ext>}.
             *
             * <p>If the target file already exists the download is skipped. Both streams are
             * closed on every path, and a file left incomplete by a failed transfer is removed
             * so that a later run does not mistake it for a finished download.
             *
             * @param url  picture URL
             * @param cate category, used as the sub-directory name
             * @param name file name without extension; any {@code -} is stripped
             * @throws IOException if the directory cannot be created or the transfer fails
             */
            private void downloadImg(String url, String cate, String name) throws IOException {
                String path = this.path + "/" + cate + "/";
                File dir = new File(path);
                // Re-check isDirectory() after mkdirs(): another worker may have created it first.
                if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
                    throw new IOException("cannot create directory " + dir);
                }

                String fileName = (name + resolveExtension(url)).replace("-", "");
                File img = new File(path + fileName);
                if (img.exists()) {
                    System.out.println(String.format("文件%s已存在本地目錄", fileName));
                    return;
                }

                URLConnection con = new URL(url).openConnection();
                con.setConnectTimeout(5000);
                con.setReadTimeout(5000);

                try (InputStream inputStream = con.getInputStream();
                     FileOutputStream os = new FileOutputStream(img)) {
                    byte[] bs = new byte[1024];
                    int len;
                    while ((len = inputStream.read(bs)) != -1) {
                        os.write(bs, 0, len);
                    }
                } catch (IOException e) {
                    // Both streams are already closed here; drop the partial file.
                    if (img.exists() && !img.delete()) {
                        System.err.println("could not remove incomplete file " + img);
                    }
                    throw e;
                }

                System.out.println("picUrl: " + url);
                System.out.println(String.format("正在下載第%s張圖片", suc.incrementAndGet()));
            }

            /**
             * Picks the extension of the last path segment of {@code url}, ignoring any query
             * string or fragment. Falls back to the configured default extension when the URL
             * has no path or the segment has no short alphanumeric extension.
             */
            private String resolveExtension(String url) {
                int end = url.length();
                int query = url.indexOf('?');
                if (query >= 0) {
                    end = Math.min(end, query);
                }
                int fragment = url.indexOf('#');
                if (fragment >= 0) {
                    end = Math.min(end, fragment);
                }

                int scheme = url.indexOf("://");
                int pathStart = url.indexOf('/', scheme >= 0 ? scheme + 3 : 0);
                if (pathStart < 0 || pathStart >= end) {
                    return extension;
                }

                String pathPart = url.substring(pathStart, end);
                String lastSegment = pathPart.substring(pathPart.lastIndexOf('/') + 1);
                int dot = lastSegment.lastIndexOf('.');
                if (dot >= 0) {
                    String candidate = lastSegment.substring(dot);
                    if (SIMPLE_EXTENSION.matcher(candidate).matches()) {
                        return candidate;
                    }
                }
                return extension;
            }

            /** Counts a failed download and reports which URL failed and why. */
            private void recordFailure(String picUrl, Exception e) {
                fails.incrementAndGet();
                System.err.println("download failed: " + picUrl + " -> " + e);
            }

            /**
             * Single-threaded download of every non-null URL in {@code data}.
             *
             * <p>Files are named after a UUID derived from the URL itself, so the name is a valid
             * file name, is stable across runs (keeping the "already downloaded" check useful)
             * and does not depend on the position in the list.
             *
             * @param data picture URLs; null entries are skipped
             * @param word category (sub-directory) name
             */
            public void process(List<String> data, String word) {
                long start = System.currentTimeMillis();
                for (String picUrl : data) {
                    if (picUrl == null) {
                        continue;
                    }
                    String name = UUID.nameUUIDFromBytes(picUrl.getBytes(StandardCharsets.UTF_8)).toString();
                    try {
                        downloadImg(picUrl, word, name);
                    } catch (Exception e) {
                        recordFailure(picUrl, e);
                    }
                }
                System.out.println("下載成功: " + suc.get());
                System.out.println("下載失敗: " + fails.get());
                long end = System.currentTimeMillis();
                System.out.println("耗時:" + (end - start) / 1000 + "秒");
            }

            /**
             * Multi-threaded download: one task per non-null URL on a cached thread pool.
             *
             * <p>Files are named after the URL's index in {@code data}, zero-padded to at least
             * four digits. The method waits up to 60 seconds for the tasks and then prints a
             * summary; tasks still running at that point are not cancelled.
             *
             * @param data picture URLs; null entries are skipped
             * @param word category (sub-directory) name
             */
            public void processSync(List<String> data, String word) {
                long start = System.currentTimeMillis();
                ExecutorService executorService = Executors.newCachedThreadPool();
                for (int i = 0; i < data.size(); i++) {
                    String picUrl = data.get(i);
                    if (picUrl == null) {
                        continue;
                    }
                    // Indexes of 1000 and above keep their digits instead of collapsing to "".
                    String finalName = String.format("%04d", i);
                    executorService.execute(() -> {
                        try {
                            downloadImg(picUrl, word, finalName);
                        } catch (Exception e) {
                            recordFailure(picUrl, e);
                        }
                    });
                }
                executorService.shutdown();
                try {
                    if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                        // Deliberately no shutdownNow(): slow downloads are allowed to finish.
                        System.err.println("timed out after 60s; some downloads are still running");
                    }
                    System.out.println("AwaitTermination Finished");
                    System.out.println("共有URL: " + data.size());
                    System.out.println("下載成功: " + suc);
                    System.out.println("下載失敗: " + fails);

                    // list() returns null when the directory was never created (e.g. no URLs).
                    String[] files = new File(this.path + "/" + word + "/").list();
                    int len = files == null ? 0 : files.length;
                    System.out.println("當前共有文件:" + len);

                    long end = System.currentTimeMillis();
                    System.out.println("耗時:" + (end - start) / 1000.0 + "秒");
                } catch (InterruptedException e) {
                    // Restore the flag so callers can observe the interruption.
                    Thread.currentThread().interrupt();
                    System.err.println("interrupted while waiting for downloads: " + e);
                }
            }

            /**
             * Multi-threaded download in segments: {@code data} is cut into {@code threadNum}
             * consecutive slices (the last one takes the remainder) and each slice is run through
             * {@link #process} on its own pool thread. Falls back to a plain sequential
             * {@link #process} call when there are fewer URLs than threads or
             * {@code threadNum} is not positive. This method does not wait for the tasks.
             *
             * @param data      picture URLs
             * @param word      category (sub-directory) name
             * @param threadNum number of segments / worker tasks
             */
            public void processSync2(List<String> data, final String word, int threadNum) {
                if (threadNum <= 0 || data.size() < threadNum) {
                    process(data, word);
                    return;
                }

                ExecutorService executorService = Executors.newCachedThreadPool();
                int num = data.size() / threadNum; // URLs per segment
                for (int i = 0; i < threadNum; i++) {
                    int start = i * num;
                    int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
                    final List<String> cutList = data.subList(start, end);
                    executorService.execute(() -> process(cutList, word));
                }
                executorService.shutdown();
            }
        }

        HttpClientUtils.java  http請求工具類

        import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.NameValuePair;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.methods.HttpUriRequest;import org.apache.http.conn.ssl.SSLConnectionSocketFactory;import org.apache.http.conn.ssl.TrustStrategy;import org.apache.http.entity.StringEntity;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.message.BasicNameValuePair;import org.apache.http.ssl.SSLContextBuilder;import org.apache.http.util.EntityUtils;import org.slf4j.Logger;import org.slf4j.LoggerFactory; import javax.net.ssl.HostnameVerifier;import javax.net.ssl.SSLContext;import javax.net.ssl.SSLSession;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.security.GeneralSecurityException;import java.security.cert.CertificateException;import java.security.cert.X509Certificate;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map; /** * @author [email protected] * Date: 17/3/27 */public abstract class HttpClientUtils {     public static Map<String, List<String>> convertHeaders(Header[] headers) {        Map<String, List<String>> results = new HashMap<String, List<String>>();        for (Header header : headers) {            List<String> list = results.get(header.getName());            if (list == null) {                list = new ArrayList<String>();                results.put(header.getName(), list);            }            list.add(header.getValue());        }        return results;    }     /**     * http的get請求     *     * @param url     */    public static String get(String url) {        return get(url, "UTF-8");    }     public static Logger logger = 
LoggerFactory.getLogger(HttpClientUtils.class);     /**     * http的get請求     *     * @param url     */    public static String get(String url, String charset) {        HttpGet httpGet = new HttpGet(url);        return executeRequest(httpGet, charset);    }     /**     * http的get請求,增加異步請求頭參數(shù)     *     * @param url     */    public static String ajaxGet(String url) {        return ajaxGet(url, "UTF-8");    }     /**     * http的get請求,增加異步請求頭參數(shù)     *     * @param url     */    public static String ajaxGet(String url, String charset) {        HttpGet httpGet = new HttpGet(url);        httpGet.setHeader("X-Requested-With", "XMLHttpRequest");        return executeRequest(httpGet, charset);    }     /**     * @param url     * @return     */    public static String ajaxGet(CloseableHttpClient httpclient, String url) {        HttpGet httpGet = new HttpGet(url);        httpGet.setHeader("X-Requested-With", "XMLHttpRequest");        return executeRequest(httpclient, httpGet, "UTF-8");    }     /**     * http的post請求,傳遞map格式參數(shù)     */    public static String post(String url, Map<String, String> dataMap) {        return post(url, dataMap, "UTF-8");    }     /**     * http的post請求,傳遞map格式參數(shù)     */    public static String post(String url, Map<String, String> dataMap, String charset) {        HttpPost httpPost = new HttpPost(url);        try {            if (dataMap != null) {                List nvps = new ArrayList();                for (Map.Entry<String, String> entry : dataMap.entrySet()) {                    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));                }                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);                formEntity.setContentEncoding(charset);                httpPost.setEntity(formEntity);            }        } catch (UnsupportedEncodingException e) {            e.printStackTrace();        }        return executeRequest(httpPost, charset);    }     /**     * 
http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)     */    public static String ajaxPost(String url, Map<String, String> dataMap) {        return ajaxPost(url, dataMap, "UTF-8");    }     /**     * http的post請求,增加異步請求頭參數(shù),傳遞map格式參數(shù)     */    public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {        HttpPost httpPost = new HttpPost(url);        httpPost.setHeader("X-Requested-With", "XMLHttpRequest");        try {            if (dataMap != null) {                List nvps = new ArrayList();                for (Map.Entry<String, String> entry : dataMap.entrySet()) {                    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));                }                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);                formEntity.setContentEncoding(charset);                httpPost.setEntity(formEntity);            }        } catch (UnsupportedEncodingException e) {            e.printStackTrace();        }        return executeRequest(httpPost, charset);    }     /**     * http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)     */    public static String ajaxPostJson(String url, String jsonString) {        return ajaxPostJson(url, jsonString, "UTF-8");    }     /**     * http的post請求,增加異步請求頭參數(shù),傳遞json格式參數(shù)     */    public static String ajaxPostJson(String url, String jsonString, String charset) {        HttpPost httpPost = new HttpPost(url);        httpPost.setHeader("X-Requested-With", "XMLHttpRequest");//    try {        StringEntity stringEntity = new StringEntity(jsonString, charset);// 解決中文亂碼問題        stringEntity.setContentEncoding(charset);        stringEntity.setContentType("application/json");        httpPost.setEntity(stringEntity);//    } catch (UnsupportedEncodingException e) {//      e.printStackTrace();//    }        return executeRequest(httpPost, charset);    }     /**     * 執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)     */    public static String 
executeRequest(HttpUriRequest httpRequest) {        return executeRequest(httpRequest, "UTF-8");    }     /**     * 執(zhí)行一個http請求,傳遞HttpGet或HttpPost參數(shù)     */    public static String executeRequest(HttpUriRequest httpRequest, String charset) {        CloseableHttpClient httpclient;        if ("https".equals(httpRequest.getURI().getScheme())) {            httpclient = createSSLInsecureClient();        } else {            httpclient = HttpClients.createDefault();        }        String result = "";        try {            try {                CloseableHttpResponse response = httpclient.execute(httpRequest);                HttpEntity entity = null;                try {                    entity = response.getEntity();                    result = EntityUtils.toString(entity, charset);                } finally {                    EntityUtils.consume(entity);                    response.close();                }            } finally {                httpclient.close();            }        } catch (IOException ex) {            ex.printStackTrace();        }        return result;    }     public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {        String result = "";        try {            try {                CloseableHttpResponse response = httpclient.execute(httpRequest);                HttpEntity entity = null;                try {                    entity = response.getEntity();                    result = EntityUtils.toString(entity, charset);                } finally {                    EntityUtils.consume(entity);                    response.close();                }            } finally {                httpclient.close();            }        } catch (IOException ex) {            ex.printStackTrace();        }        return result;    }     /**     * 創(chuàng)建 SSL連接     */    public static CloseableHttpClient createSSLInsecureClient() {        try {            SSLContext sslContext = new 
SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {                @Override                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {                    return true;                }            }).build();            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {                @Override                public boolean verify(String hostname, SSLSession session) {                    return true;                }            });            return HttpClients.custom().setSSLSocketFactory(sslsf).build();        } catch (GeneralSecurityException ex) {            throw new RuntimeException(ex);        }    }}

        運行

        由於網絡等原因,我們發現並不能全部下載成功,不過可以多次運行嘗試,以實現較高的下載成功率。

        PS:如果覺得我的分享不錯,歡迎大家隨手點贊、在看。
        END
        瀏覽 140
        點贊
        評論
        收藏
        分享

        手機掃一掃分享

        分享
        舉報
        評論
        圖片
        表情
        推薦
        點贊
        評論
        收藏
        分享

        手機掃一掃分享

        分享
        舉報
        1. <strong id="7actg"></strong>
        2. <table id="7actg"></table>

        3. <address id="7actg"></address>
          <address id="7actg"></address>
          1. <object id="7actg"><tt id="7actg"></tt></object>
            中文字幕第9页 | 成人毛片女人28 | 淫操极品| 日韩性爱一区二区 | 国产中文字幕乱人伦在线观看 | 白虎穴视频| 97精品久久不卡网 | 男女强吻摸下面揉免费 | 亚洲午夜成人视频 | 欧美性爱综合 |