1. <strong id="7actg"></strong>
    2. <table id="7actg"></table>

    3. <address id="7actg"></address>
      <address id="7actg"></address>
      1. <object id="7actg"><tt id="7actg"></tt></object>

        Spring Boot + WebMagic 實現網頁爬蟲,寫得太好了!

        共 347字,需瀏覽 1分鐘

         ·

        2021-09-13 17:54

        點擊關注公眾號,Java干貨及時送達

        來源:www.jianshu.com/p/cfead4b3e34e

        WebMagic是一個開源的java爬蟲框架。

        WebMagic框架的使用并不是本文的重點,具體如何使用請參考官方文檔:http://webmagic.io/docs/。

        本文是對spring boot+WebMagic+MyBatis做了整合,使用WebMagic爬取數據,然后通過MyBatis持久化爬取的數據到mysql數據庫。

        本文提供的源代碼可以作為java爬蟲項目的腳手架。

        1.添加maven依賴

        <?xml version="1.0" encoding="UTF-8"?>
        <project xmlns="http://maven.apache.org/POM/4.0.0"
                 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
            <modelVersion>4.0.0</modelVersion>

            <groupId>hyzx</groupId>
            <artifactId>qbasic-crawler</artifactId>
            <version>1.0.0</version>

            <!-- Parent POM: spring-boot-starter-parent, pinned to 1.5.21.RELEASE. -->
            <parent>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-parent</artifactId>
                <version>1.5.21.RELEASE</version>
                <relativePath/> <!-- lookup parent from repository -->
            </parent>

            <properties>
                <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
                <maven.test.skip>true</maven.test.skip>
                <java.version>1.8</java.version>
                <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
                <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>

                <mysql.connector.version>5.1.47</mysql.connector.version>
                <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
                <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
                <fastjson.version>1.2.58</fastjson.version>
                <commons.lang3.version>3.9</commons.lang3.version>
                <joda.time.version>2.10.2</joda.time.version>
                <webmagic.core.version>0.7.3</webmagic.core.version>
            </properties>

            <!-- Project dependencies. The Spring Boot artifacts declare no <version> here;
                 it is expected to come from the parent POM's dependency management. -->
            <dependencies>
                <!-- Development-time tooling: runtime scope, optional. -->
                <dependency>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-devtools</artifactId>
                    <scope>runtime</scope>
                    <optional>true</optional>
                </dependency>

                <!-- Test support (test scope only). -->
                <dependency>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-starter-test</artifactId>
                    <scope>test</scope>
                </dependency>

                <dependency>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-configuration-processor</artifactId>
                    <optional>true</optional>
                </dependency>

                <!-- MySQL JDBC driver. -->
                <dependency>
                    <groupId>mysql</groupId>
                    <artifactId>mysql-connector-java</artifactId>
                    <version>${mysql.connector.version}</version>
                </dependency>

                <!-- Druid connection pool starter. -->
                <dependency>
                    <groupId>com.alibaba</groupId>
                    <artifactId>druid-spring-boot-starter</artifactId>
                    <version>${druid.spring.boot.starter.version}</version>
                </dependency>

                <!-- MyBatis starter. -->
                <dependency>
                    <groupId>org.mybatis.spring.boot</groupId>
                    <artifactId>mybatis-spring-boot-starter</artifactId>
                    <version>${mybatis.spring.boot.starter.version}</version>
                </dependency>

                <dependency>
                    <groupId>com.alibaba</groupId>
                    <artifactId>fastjson</artifactId>
                    <version>${fastjson.version}</version>
                </dependency>

                <dependency>
                    <groupId>org.apache.commons</groupId>
                    <artifactId>commons-lang3</artifactId>
                    <version>${commons.lang3.version}</version>
                </dependency>

                <dependency>
                    <groupId>joda-time</groupId>
                    <artifactId>joda-time</artifactId>
                    <version>${joda.time.version}</version>
                </dependency>

                <!-- WebMagic crawler core. The slf4j-log4j12 binding is excluded;
                     NOTE(review): presumably to avoid clashing with the logging binding
                     Spring Boot already provides (confirm). -->
                <dependency>
                    <groupId>us.codecraft</groupId>
                    <artifactId>webmagic-core</artifactId>
                    <version>${webmagic.core.version}</version>
                    <exclusions>
                        <exclusion>
                            <groupId>org.slf4j</groupId>
                            <artifactId>slf4j-log4j12</artifactId>
                        </exclusion>
                    </exclusions>
                </dependency>
            </dependencies>

            <!-- Build plugins: compiler and resources use the encoding and Java level
                 declared in <properties>; the Spring Boot plugin repackages the artifact. -->
            <build>
                <plugins>
                    <!-- Compile with the Java version and source encoding from <properties>. -->
                    <plugin>
                        <groupId>org.apache.maven.plugins</groupId>
                        <artifactId>maven-compiler-plugin</artifactId>
                        <version>${maven.compiler.plugin.version}</version>
                        <configuration>
                            <source>${java.version}</source>
                            <target>${java.version}</target>
                            <encoding>${project.build.sourceEncoding}</encoding>
                        </configuration>
                    </plugin>

                    <!-- Copy resources using the same encoding as the sources. -->
                    <plugin>
                        <groupId>org.apache.maven.plugins</groupId>
                        <artifactId>maven-resources-plugin</artifactId>
                        <version>${maven.resources.plugin.version}</version>
                        <configuration>
                            <encoding>${project.build.sourceEncoding}</encoding>
                        </configuration>
                    </plugin>

                    <!-- No <version> is given for this plugin here; it is expected to be
                         supplied by the parent POM. The repackage goal is bound explicitly. -->
                    <plugin>
                        <groupId>org.springframework.boot</groupId>
                        <artifactId>spring-boot-maven-plugin</artifactId>
                        <configuration>
                            <fork>true</fork>
                            <addResources>true</addResources>
                        </configuration>
                        <executions>
                            <execution>
                                <goals>
                                    <goal>repackage</goal>
                                </goals>
                            </execution>
                        </executions>
                    </plugin>
                </plugins>
            </build>

            <repositories>
                <repository>
                    <id>public</id>
                    <name>aliyun nexus</name>
                    <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
                    <releases>
                        <enabled>true</enabled>
                    </releases>
                </repository>
            </repositories>

            <pluginRepositories>
                <pluginRepository>
                    <id>public</id>
                    <name>aliyun nexus</name>
                    <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
                    <releases>
                        <enabled>true</enabled>
                    </releases>
                    <snapshots>
                        <enabled>false</enabled>
                    </snapshots>
                </pluginRepository>
            </pluginRepositories>
        </project>

        2.項目配置文件 application.properties

        配置mysql數據源,druid數據庫連接池以及MyBatis的mapper文件的位置。Spring Boot 基礎就不介紹了,最全教程和示例源碼推薦看這里:https://github.com/javastacks/spring-boot-best-practice

        # MySQL data source configuration
        # NOTE(review): host, database name and root/root credentials are hard-coded here;
        # presumably sample values -- confirm and externalize before real use.
        spring.datasource.name=mysql
        spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
        spring.datasource.driver-class-name=com.mysql.jdbc.Driver
        spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
        spring.datasource.username=root
        spring.datasource.password=root

        # Druid connection pool configuration
        spring.datasource.druid.initial-size=5
        spring.datasource.druid.min-idle=5
        spring.datasource.druid.max-active=10
        spring.datasource.druid.max-wait=60000
        spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
        spring.datasource.druid.test-on-borrow=false
        spring.datasource.druid.test-on-return=false
        spring.datasource.druid.test-while-idle=true
        spring.datasource.druid.time-between-eviction-runs-millis=60000
        spring.datasource.druid.min-evictable-idle-time-millis=300000
        spring.datasource.druid.max-evictable-idle-time-millis=600000

        # MyBatis configuration: where the mapper XML files are looked up on the classpath
        mybatis.mapperLocations=classpath:mapper/**/*.xml

        3.數據庫表結構

        -- CMS content table: one row per stored article, keyed by contentId.
        -- The releaseDate column comment had stray "(fā)" text embedded in it; restored here.
        CREATE TABLE `cms_content` (
          `contentId` varchar(40) NOT NULL COMMENT '內容ID',
          `title` varchar(150) NOT NULL COMMENT '標題',
          `content` longtext COMMENT '文章內容',
          `releaseDate` datetime NOT NULL COMMENT '發布日期',
          PRIMARY KEY (`contentId`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS內容表';

        4.實體類

        import java.util.Date;

        public class CmsContentPO {
            private String contentId;

            private String title;

            private String content;

            private Date releaseDate;

            public String getContentId() {
                return contentId;
            }

            public void setContentId(String contentId) {
                this.contentId = contentId;
            }

            public String getTitle() {
                return title;
            }

            public void setTitle(String title) {
                this.title = title;
            }

            public String getContent() {
                return content;
            }

            public void setContent(String content) {
                this.content = content;
            }

            public Date getReleaseDate() {
                return releaseDate;
            }

            public void setReleaseDate(Date releaseDate) {
                this.releaseDate = releaseDate;
            }
        }

        5.mapper接口

        /**
         * MyBatis mapper interface for persisting crawled content.
         */
        public interface CrawlerMapper {
            /**
             * Persists one content record.
             *
             * @param record the content to store
             * @return an int result; NOTE(review): presumably the number of affected rows,
             *         since the pipeline in this file treats a value greater than 0 as success -- confirm
             */
            int addCmsContent(CmsContentPO record);
        }

        6.CrawlerMapper.xml文件

        <?xml version="1.0" encoding="UTF-8"?>
        <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
        <mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">

            <!-- addCmsContent: inserts one cms_content row from the properties of a CmsContentPO. -->
            <insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
                insert into cms_content (contentId, title, releaseDate, content)
                values (
                    #{contentId,jdbcType=VARCHAR},
                    #{title,jdbcType=VARCHAR},
                    #{releaseDate,jdbcType=TIMESTAMP},
                    #{content,jdbcType=LONGVARCHAR}
                )
            </insert>
        </mapper>

        7.XXX頁面內容處理類XXXPageProcessor

        主要用于解析爬取到的XXX html頁面。

        點擊關注公眾號,Java干貨及時送達

        @Component
        public class XXXPageProcessor implements PageProcessor {

            private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

            @Override
            public void process(Page page) {
                page.addTargetRequests(page.getHtml().links().regex("https://www\\.xxx\\.com/question/\\d+/answer/\\d+.*").all());
                page.putField("title", page.getHtml().xpath("http://h1[@class='QuestionHeader-title']/text()").toString());
                page.putField("answer", page.getHtml().xpath("http://div[@class='QuestionAnswer-content']/tidyText()").toString());
                if (page.getResultItems().get("title") == null) {
                    // 如果是列表頁,跳過此頁,pipeline不進行后續(xù)處理
                    page.setSkip(true);
                }
            }

            @Override
            public Site getSite() {
                return site;
            }
        }

        8.XXX數據處理類XXXPipeline

        主要用于將XXX html頁面解析出的數據存儲到mysql數據庫。另外,MySQL 系列面試題和答案全部整理好了,微信搜索Java技術棧,在后臺發送:面試,可以在線閱讀。

        @Component
        public class XXXPipeline implements Pipeline {
            private static final Logger LOGGER = LoggerFactory.getLogger(XXXPipeline.class);

            @Autowired
            private CrawlerMapper crawlerMapper;

            public void process(ResultItems resultItems, Task task) {
                String title = resultItems.get("title");
                String answer = resultItems.get("answer");

                CmsContentPO contentPO = new CmsContentPO();
                contentPO.setContentId(UUID.randomUUID().toString());
                contentPO.setTitle(title);
                contentPO.setReleaseDate(new Date());
                contentPO.setContent(answer);

                try {
                    boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
                    LOGGER.info("保存文章成功:{}", title);
                } catch (Exception ex) {
                    LOGGER.error("保存文章失敗", ex);
                }
            }
        }

        9.爬蟲任務類XXXTask

        每十分鐘啟動一次爬蟲。

        @Component
        public class XXXTask {
            private static final Logger LOGGER = LoggerFactory.getLogger(XXXPipeline.class);

            @Autowired
            private XXXPipeline XXXPipeline;

            @Autowired
            private XXXPageProcessor xxxPageProcessor;

            private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

            public void crawl() {
                // 定時任務,每10分鐘爬取一次
                timer.scheduleWithFixedDelay(() -> {
                    Thread.currentThread().setName("xxxCrawlerThread");

                    try {
                        Spider.create(xxxPageProcessor)
                                // 從https://www.xxx.com/explore開始抓
                                .addUrl("https://www.xxx.com/explore")
                                // 抓取到的數(shù)據(jù)存數(shù)據(jù)庫
                                .addPipeline(xxxPipeline)
                                // 開啟2個線程抓取
                                .thread(2)
                                // 異步啟動爬蟲
                                .start();
                    } catch (Exception ex) {
                        LOGGER.error("定時抓取數(shù)據(jù)線程執(zhí)行異常", ex);
                    }
                }, 0, 10, TimeUnit.MINUTES);
            }
        }

        10.Spring boot程序啟動類

        @SpringBootApplication
        @MapperScan(basePackages = "com.hyzx.qbasic.dao")
        public class Application implements CommandLineRunner {

            @Autowired
            private XXXTask xxxTask;

            public static void main(String[] args) throws IOException {
                SpringApplication.run(Application.class, args);
            }

            @Override
            public void run(String... strings) throws Exception {
                // 爬取數(shù)據(jù)
                xxxTask.crawl();
            }
        }

        最后,關注公眾號Java技術棧,在后臺回復:面試,可以獲取我整理的 Java/ Spring Boot 系列面試題和答案,非常齊全。






        關注Java技術棧看更多干貨



        獲取 Spring Boot 實戰筆記!

        瀏覽 59
        點贊
        評論
        收藏
        分享

        手機掃一掃分享

        分享
        舉報
        評論
        圖片
        表情
        推薦
        點贊
        評論
        收藏
        分享

        手機掃一掃分享

        分享
        舉報
        1. <strong id="7actg"></strong>
        2. <table id="7actg"></table>

          <address id="7actg"></address>
          <address id="7actg"></address>
          1. <object id="7actg"><tt id="7actg"></tt></object>
            黄色国产一级片 | 99久久免费看精品国产一区 | 久久亚洲精品国产美女写真集下载 | 永久免费处破女A片 | 日本欧美视频网站 | 青青草原网| 9l人人澡人人妻人人精品 | 美女18隐私视频网站动漫 | 人人澡超碰碰97碰碰碰软件 | 九九热精品国产 |