Spring 整合 MyBatis + WebMagic 爬取数据并持久化
2017-02-10 本文已影响3107人
freelands
因为最近在爬数据进行分析,数据已经爬好了,但最后还是需要持久化到数据库。因为公司用的持久化框架是mybatis,这里面又不需要mvc的架构,所以只需要spring 和 mybatis进行整合就行了,spring 作为bean容器,mybatis负责orm映射和持久化。
我这边用的是gradle构建工具,下面是我的依赖:
// WebMagic crawler plus the Selenium / PhantomJS driver artifacts
compile 'us.codecraft:webmagic-core:0.5.3'
compile('us.codecraft:webmagic-extension:0.5.3')
compile 'org.seleniumhq.selenium:selenium-java:2.8.0'
compile group: 'us.codecraft', name: 'webmagic-selenium', version: '0.5.2'
compile 'com.github.detro:phantomjsdriver:1.2.0'
testCompile group: 'junit', name: 'junit', version: '4.11'
compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.8.5'
// Spring modules, all pinned to the same 4.2.4.RELEASE version
// (the spring-aop coordinate previously contained a stray space inside "RELEASE" and could not resolve)
compile 'org.springframework:spring-aop:4.2.4.RELEASE'
compile 'org.springframework:spring-context:4.2.4.RELEASE'
compile 'org.springframework:spring-beans:4.2.4.RELEASE'
compile 'org.springframework:spring-web:4.2.4.RELEASE'
compile 'org.springframework:spring-webmvc:4.2.4.RELEASE'
compile 'org.springframework:spring-tx:4.2.4.RELEASE'
compile 'org.springframework:spring-jdbc:4.2.4.RELEASE'
compile 'org.springframework:spring-test:4.2.4.RELEASE'
// Persistence: MySQL JDBC driver, MyBatis and its Spring integration
compile 'mysql:mysql-connector-java:5.1.38'
compile 'org.mybatis.generator:mybatis-generator-core:1.3.2'
compile 'org.mybatis:mybatis-spring:1.2.3'
compile 'org.mybatis:mybatis:3.3.0'
// Logging, connection pooling and Lombok
compile group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.6.2'
compile group: 'org.apache.commons', name: 'commons-dbcp2', version: '2.1.1'
compile group: 'org.projectlombok', name: 'lombok', version: '1.16.10'
接下来是实体类PO(基金):
@Data
@Builder
public class Fund{
private int id;
private String fundCode;
private String fundName;
private String dailyGrowthRate;
private String monthlyGrowthRate;
}
数据库的schema如下:
-- Table with one row per fund: an auto-incremented integer key plus four
-- nullable text columns (code, name, daily and monthly growth rate).
-- NOTE(review): AUTO_INCREMENT=11594 looks like it was carried over from a dump
-- of an already populated table; confirm whether a fresh schema should keep it.
CREATE TABLE `fund` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`fund_code` varchar(255) DEFAULT NULL,
`fund_name` varchar(255) DEFAULT NULL,
`daily_growth_rate` varchar(255) DEFAULT NULL,
`monthly_growth_rate` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=11594 DEFAULT CHARSET=utf8;
然后是FundMapper:
/**
 * MyBatis mapper interface for the {@code fund} table. The SQL for each method
 * is supplied by the mapper XML whose namespace matches this interface.
 */
public interface FundMapper {
/**
 * Inserts the given fund as a new row.
 *
 * @param fund the fund to store
 * @return the int reported by MyBatis for the insert statement
 *         (presumably the affected-row count; verify against the MyBatis docs)
 */
int insert(Fund fund);
}
然后是业务类FundService:
其中@Service注解配合component-scan会把这个类注册为Spring容器中的bean;
@Autowired则是按照类型进行装配。
/**
 * Service-layer facade for storing {@link Fund} records.
 */
@Service
public class FundService {

    /** Mapper that executes the SQL; injected by type by the Spring container. */
    @Autowired
    private FundMapper mapper;

    /**
     * Stores one fund by handing it to the mapper.
     *
     * @param fund the fund to store
     * @return whatever the mapper's {@code insert} call returns
     */
    public int insert(Fund fund) {
        final int insertResult = mapper.insert(fund);
        return insertResult;
    }
}
接下来是FundMapper.xml文件:
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- SQL mappings backing the stock.mapper.FundMapper interface. -->
<mapper namespace="stock.mapper.FundMapper">
    <!-- Maps the columns of the fund table onto the properties of stock.po.Fund. -->
    <resultMap id="BaseResultMap" type="stock.po.Fund">
        <id column="id" property="id" jdbcType="INTEGER"/>
        <result column="fund_code" property="fundCode" jdbcType="VARCHAR"/>
        <result column="fund_name" property="fundName" jdbcType="VARCHAR"/>
        <result column="daily_growth_rate" property="dailyGrowthRate" jdbcType="VARCHAR"/>
        <result column="monthly_growth_rate" property="monthlyGrowthRate" jdbcType="VARCHAR"/>
    </resultMap>
    <!-- Full column list, kept for statements that read whole rows. -->
    <sql id="BaseColumnList">
        id,fund_code,fund_name,daily_growth_rate,monthly_growth_rate
    </sql>
    <!--
        Inserts one fund. The id column is deliberately left out so that the table's
        AUTO_INCREMENT assigns it, instead of sending the int default 0 and relying on
        the server treating 0 as "generate a value". useGeneratedKeys/keyProperty copy
        the generated key back into Fund.id after the insert.
        NOTE(review): an id set by the caller is therefore ignored; confirm no caller
        depends on choosing the id.
    -->
    <insert id="insert" parameterType="stock.po.Fund" useGeneratedKeys="true" keyProperty="id">
        INSERT INTO fund(
            fund_code,fund_name,daily_growth_rate,monthly_growth_rate
        )
        VALUES (
            #{fundCode,jdbcType=VARCHAR},
            #{fundName,jdbcType=VARCHAR},
            #{dailyGrowthRate,jdbcType=VARCHAR},
            #{monthlyGrowthRate,jdbcType=VARCHAR}
        )
    </insert>
</mapper>
接着是mybatis的配置文件:
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis configuration: its only job here is to register the mapper XML files. -->
<configuration>
<mappers>
<!-- Resource path of the fund mapper XML to load. -->
<mapper resource="mapper/FundMapper.xml"/>
</mappers>
</configuration>
然后就是spring的配置文件:
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:p="http://www.springframework.org/schema/p"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:mvc="http://www.springframework.org/schema/mvc"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-4.0.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-4.0.xsd
http://www.springframework.org/schema/mvc
http://www.springframework.org/schema/mvc/spring-mvc-4.0.xsd">
<!-- Spring context wiring the data source, the MyBatis session factory and the fund mapper. -->
<!-- NOTE(review): the p and mvc namespaces are declared but not used in this file. -->
<!-- Load the JDBC settings from jdbc.properties on the classpath -->
<context:property-placeholder location="classpath:jdbc.properties"/>
<!-- Scan the stock packages for annotated components such as @Service classes -->
<context:component-scan base-package="stock.**"/>
<!-- Data source, backed by Commons DBCP2 -->
<bean id="dataSource" class="org.apache.commons.dbcp2.BasicDataSource" destroy-method="close">
<property name="driverClassName" value="${jdbc.driver}" /><!-- the property name must be driverClassName; plain "driver" does not work -->
<property name="url" value="${jdbc.url}" />
<property name="username" value="${jdbc.username}" />
<property name="password" value="${jdbc.password}" />
</bean>
<!-- sqlSessionFactory -->
<bean id = "sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
<!-- Load the MyBatis configuration file -->
<property name="configLocation" value="mybatis-config.xml"></property>
<!-- Data source -->
<property name="dataSource" ref="dataSource"></property>
</bean>
<!-- Mapper configuration: MapperFactoryBean generates a proxy object from the mapper interface -->
<bean id="fundMapper" class="org.mybatis.spring.mapper.MapperFactoryBean">
<property name="mapperInterface" value="stock.mapper.FundMapper"/>
<property name="sqlSessionFactory" ref="sqlSessionFactory"/>
</bean>
</beans>
其中jdbc.properties的文件如下:
# JDBC settings referenced as ${jdbc.*} placeholders by the Spring data source bean.
jdbc.driver=com.mysql.jdbc.Driver
# Local MySQL instance, database "test", UTF-8 connection encoding.
jdbc.url=jdbc:mysql://127.0.0.1:3306/test?useUnicode=true&characterEncoding=utf8
jdbc.username=root
# NOTE(review): empty root password; presumably a local development setup only, confirm before reuse.
jdbc.password=
然后逻辑代码如下:
/**
 * WebMagic {@link PageProcessor} that reads the rows of the fund list table on each
 * crawled page, prints the extracted cells and stores every row through
 * {@link FundService}.
 *
 * <p>{@link #main(String[])} runs the spider with several threads on a single processor
 * instance, so all instance state is final and assigned once in the constructor.
 */
public class NewFundProcessor implements PageProcessor {

    private static final Logger log = LoggerFactory.getLogger(NewFundProcessor.class);

    /**
     * Listing URL up to and including the {@code currentPage=} parameter; the page number
     * is appended directly after it. (The literal previously contained a corrupted
     * {@code &curren} entity in place of {@code &currentPage}.)
     */
    private static final String prefix = "https://e.lufunds.com/jijin/allFund?subType=&haitongGrade=&fundGroupId=&currentPage=";

    /** Remainder of the listing URL that follows the page number. */
    private static final String suffix = "&orderType=twelve_month_increase_desc&canFixInvest=&searchWord=#sortTab";

    /** Number of listing pages queued by {@link #main(String[])}. */
    private static final int PAGE_COUNT = 250;

    /** Number of spider threads started by {@link #main(String[])}. */
    private static final int THREAD_COUNT = 10;

    /** Spring context that owns the data source, mapper and service beans. */
    private final ApplicationContext context;

    /** Service used to persist each row; resolved once rather than once per row. */
    private final FundService fundService;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(3000)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36");

    /**
     * Boots the Spring context from {@code applicationContext.xml} on the classpath and
     * looks up the {@code fundService} bean.
     */
    public NewFundProcessor() {
        context = new ClassPathXmlApplicationContext("classpath:applicationContext.xml");
        fundService = context.getBean("fundService", FundService.class);
    }

    /**
     * Extracts every row of the fund table from the page, prints its cells and inserts it.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        log.debug("Processing fund list page {}", page.getUrl());
        System.out.println("first ------------------");
        List<String> rows = page.getHtml().xpath("div[@class='listTable']/table[@id='fundTable']/tbody/tr").all();
        for (String row : rows) {
            // Each row is re-parsed on its own with every "td" turned into "div".
            // NOTE(review): presumably because bare <td> cells are not kept when parsed
            // outside a <table>; this also rewrites any "td" inside text or attributes.
            Html h = new Html(row.replace("td", "div"));
            printRow(h);
            int result = fundService.insert(toFund(h));
            System.out.println(result);
            System.out.println("-------");
        }
        System.out.println("size:" + rows.size());
    }

    /** Prints every cell of one re-parsed row to standard output. */
    private static void printRow(Html h) {
        System.out.println("基金代码:" + h.xpath("//div[1]/text()"));
        System.out.println("基金简介:" + h.xpath("//div[2]/a/text()"));
        System.out.println("最新净值:" + h.xpath("//div[3]/p[1]/text()"));
        System.out.println("时间:" + h.xpath("//p[2]/text()"));
        System.out.println("日增长率:" + h.xpath("//div[4]/span/text()"));
        System.out.println("最近一月增长率:" + h.xpath("//div[5]/span/text()"));
        System.out.println("最近三月增长率:" + h.xpath("//div[6]/span/text()"));
        System.out.println("最近一年增长率:" + h.xpath("//div[7]/span/text()"));
        System.out.println("今年增长率:" + h.xpath("//div[8]/span/text()"));
        System.out.println("成立以来增长率:" + h.xpath("//div[9]/span/text()"));
        System.out.println("起投金额:" + h.xpath("//div[10]/text()"));
    }

    /** Builds the {@link Fund} to persist from the code, name and growth-rate cells of a row. */
    private static Fund toFund(Html h) {
        return Fund.builder()
                .fundCode(h.xpath("//div[1]/text()").get())
                .fundName(h.xpath("//div[2]/a/text()").get())
                .dailyGrowthRate(h.xpath("//div[4]/span/text()").get())
                .monthlyGrowthRate(h.xpath("//div[5]/span/text()").get())
                .build();
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues listing pages 1 through {@link #PAGE_COUNT} and starts the spider
     * asynchronously.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> urls = new ArrayList<String>(PAGE_COUNT);
        for (int i = 1; i <= PAGE_COUNT; i++) {
            urls.add(prefix + i + suffix);
        }
        NewFundProcessor processor = new NewFundProcessor();
        Spider.create(processor)
                .startUrls(urls)
                .thread(THREAD_COUNT)
                .runAsync();
    }
}
执行之后会往数据库插入3000多条基金的数据:

