Writing a crawler to scrape house listing data
Developing the house search API service
Integrating the frontend to implement search
Optimizing search with highlighting and pagination
Implementing hot-word recommendation
Pinyin tokenization
To enrich our house listing data, we will use WebMagic to crawl some listings; the target site is Shanghai Lianjia (sh.lianjia.com).
Open the it-es project and add the following dependencies to pom.xml:

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>
Let's start with the official demo to get a quick feel for WebMagic.
package cn.itcast.es.wm;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Official example demo
 */
public class GithubRepoPageProcessor implements PageProcessor {

    // Crawler policy
    private Site site = Site.me()
            .setRetryTimes(3)     // retry up to 3 times after a failure
            .setSleepTime(1000)   // wait 1 second between requests
            .setTimeOut(10000);   // fail with a timeout if a download takes longer than 10s

    @Override
    public void process(Page page) {
        // Queue more target URLs found on the current page
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
        // Extract the author from URLs matching https://github.com/(\w+)/.*
        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
        // Text of the <a> inside <strong> under the <h1 class='entry-title public'> on the current page
        page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
        // Skip list pages: only detail pages have the name field
        if (page.getResultItems().get("name") == null) {
            // skip this page
            page.setSkip(true);
        }
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    // Note: running this may report errors; that is expected, the official demo behaves this way.
    public static void main(String[] args) {
        // Set the entry URL and crawl with 5 threads
        Spider.create(new GithubRepoPageProcessor())
                .addUrl("https://github.com/code4craft")
                .thread(5)
                .run();
    }
}
Enough talk; let's create a LianjiaPageProcessor to parse the Lianjia pages.
Target page:
https://sh.lianjia.com/zufang/pg1
pg1 is page 1, so varying the number from 1 to 100 walks the first 100 listing pages.
package cn.itcast.es.wm;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import java.util.List;

/**
 * Parses the first 100 pages of Shanghai Lianjia rental listings:
 * https://sh.lianjia.com/zufang/pg{1...100}
 */
public class LianjiaPageProcessor implements PageProcessor {

    // Retry up to 3 times on download failure; wait 200ms between requests
    private Site site = Site.me().setRetryTimes(3).setSleepTime(200);

    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        // Collect all detail-page links from the listing page via a CSS selector
        List<String> all = html.css(".content__list--item--title a").links().all();
        // Queue every detail link for the next crawl round
        page.addTargetRequests(all);
        // Parse the detail-page fields into the page
        resolveFields(page, html);
        // Only the entry page has no title
        if (page.getResultItems().get("title") == null) {
            // Do not persist this page
            page.setSkip(true);
            // Queue the 100 listing pages
            for (int i = 1; i <= 100; i++) {
                page.addTargetRequest("https://sh.lianjia.com/zufang/pg" + i);
            }
        }
    }

    private void resolveFields(Page page, Html html) {
        // Fields are located uniformly via XPath
        String title = html.xpath("//div[@class='content clear w1150']/p/text()").toString();
        page.putField("title", title);
        String rent = html.xpath("//div[@class='content__aside--title']/span/text()").toString();
        page.putField("rent", rent);
        String type = html.xpath("//ul[@class='content__aside__list']/allText()").toString();
        page.putField("type", type);
        String info = html.xpath("//div[@class='content__article__info']/allText()").toString();
        page.putField("info", info);
        String img = html.xpath("//div[@class='content__article__slide__item']/img").toString();
        page.putField("img", img);
        // String printInfo = page.getRequest().getUrl() + "\ntitle = " + title + "\n" + "rent" + rent + "\n"
        //         + "type" + type + "\n" + "info" + info + "\n" + "img" + img;
        // System.out.println(printInfo);
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Start from the entry page with five worker threads,
        // and process the extracted results with the custom MyPipeline
        Spider.create(new LianjiaPageProcessor())
                .addUrl("https://sh.lianjia.com/zufang/")
                .thread(5)
                .addPipeline(new MyPipeline())
                .run();
    }
}
The MyPipeline class is as follows:
package cn.itcast.es.wm;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.File;
import java.util.HashMap;
import java.util.Map;

public class MyPipeline implements Pipeline {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> data = new HashMap<>();
        data.put("url", resultItems.getRequest().getUrl());
        data.put("title", resultItems.get("title"));  // title
        data.put("rent", resultItems.get("rent"));    // rent

        String[] types = StringUtils.split(resultItems.get("type"), ' ');
        data.put("rentMethod", types[0]);    // rental method
        data.put("houseType", types[1]);     // house type, e.g. 2室1厅1卫
        data.put("orientation", types[2]);   // orientation

        String[] infos = StringUtils.split(resultItems.get("info"), ' ');
        for (String info : infos) {
            if (StringUtils.startsWith(info, "看房:")) {
                data.put("time", StringUtils.split(info, ':')[1]);
            } else if (StringUtils.startsWith(info, "楼层:")) {
                data.put("floor", StringUtils.split(info, ':')[1]);
            }
        }

        // The image src is the 4th quote-delimited token of the <img> tag
        String imageUrl = StringUtils.split(resultItems.get("img"), '"')[3];
        // Name the local file after the listing id in the URL
        String newName = StringUtils.substringBefore(StringUtils.substringAfterLast(
                resultItems.getRequest().getUrl(), "/"), ".") + ".jpg";
        try {
            this.downloadFile(imageUrl, new File("F:\\code\\images\\" + newName));
            data.put("image", newName);
            String json = MAPPER.writeValueAsString(data);
            // Append one JSON document per line to F:\code\data.json.
            // If this write() overload cannot be resolved, your commons-io version
            // is too old; check for a dependency conflict.
            FileUtils.write(new File("F:\\code\\data.json"), json + "\n", "UTF-8", true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads a file.
     *
     * @param url  file URL
     * @param dest destination file
     */
    public void downloadFile(String url, File dest) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        CloseableHttpResponse response = HttpClientBuilder.create().build().execute(httpGet);
        try {
            FileUtils.writeByteArrayToFile(dest, IOUtils.toByteArray(response.getEntity().getContent()));
        } finally {
            response.close();
        }
    }
}
After the run, check the output directory: data.json has already grown to 899 KB.
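Each line of data.json is one flat JSON document built from the fields MyPipeline collects. An illustrative line (all field values here are made up for illustration, not real scraped data):

{"url":"https://sh.lianjia.com/zufang/SH000000.html","title":"整租·某某路 2室1厅 南","rent":"5000","rentMethod":"整租","houseType":"2室1厅1卫","orientation":"南","time":"随时看房","floor":"中楼层/6层","image":"SH000000.jpg"}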
Next, create the Elasticsearch index and mapping:
PUT http://{ip}:9200/haoke
# Do not copy the comments below into your request; they will cause an error. Delete them before sending.
{
    "settings": {
        "index": {
            "number_of_shards": 6,
            "number_of_replicas": 1
        }
    },
    "mappings": {
        "house": {
            "dynamic": false,  # controls dynamic field creation; false means extra fields can still be written, but they are not indexed and cannot be queried
            "properties": {
                "title": {
                    "type": "text",
                    "analyzer": "ik_max_word"  # tokenize with the IK Chinese analyzer before indexing
                },
                "image": {
                    "type": "keyword",
                    "index": false  # index: false means the field is not indexed (not searchable)
                },
                "orientation": {
                    "type": "keyword",
                    "index": false
                },
                "houseType": {
                    "type": "keyword",
                    "index": false
                },
                "rentMethod": {
                    "type": "keyword",
                    "index": false
                },
                "time": {
                    "type": "keyword",
                    "index": false
                },
                "rent": {
                    "type": "keyword",
                    "index": false
                },
                "floor": {
                    "type": "keyword",
                    "index": false
                }
            }
        }
    }
}
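You can read the mapping back with the standard _mapping endpoint to confirm it was created:

GET http://{ip}:9200/haoke/_mapping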
With the index in place, we bulk-import the data using a JUnit test.
package cn.itcast.es.rest;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Tests using the low-level REST client
 */
public class TestESREST {

    // Object-to-JSON mapper
    private static final ObjectMapper MAPPER = new ObjectMapper();

    // ES low-level REST client
    private RestClient restClient;

    /**
     * Connect to ES
     */
    @Before
    public void init() {
        // For a distributed ES cluster, pass additional comma-separated HttpHost entries here
        RestClientBuilder restClientBuilder = RestClient.builder(
                new HttpHost("134.175.110.184", 9200, "http"));
        restClientBuilder.setFailureListener(new RestClient.FailureListener() {
            @Override
            public void onFailure(Node node) {
                System.out.println("Node failed -> " + node);
            }
        });
        this.restClient = restClientBuilder.build();
    }

    @After
    public void after() throws IOException {
        restClient.close();
    }

    @Test
    public void testBulk() throws Exception {
        Request request = new Request("POST", "/haoke/house/_bulk");
        List<String> lines = FileUtils.readLines(new File("F:\\code\\data.json"), "UTF-8");
        // Bulk action line: index each document into haoke/house
        String createStr = "{\"index\": {\"_index\":\"haoke\",\"_type\":\"house\"}}";
        StringBuilder sb = new StringBuilder();
        int count = 0;
        for (String line : lines) {
            sb.append(createStr).append("\n").append(line).append("\n");
            count++;
            // Send a bulk request every 100 documents
            if (count >= 100) {
                request.setJsonEntity(sb.toString());
                Response response = this.restClient.performRequest(request);
                System.out.println("Request done -> " + response.getStatusLine());
                System.out.println(EntityUtils.toString(response.getEntity()));
                count = 0;
                sb = new StringBuilder();
            }
        }
        // Flush the final partial batch so no documents are lost
        if (sb.length() > 0) {
            request.setJsonEntity(sb.toString());
            Response response = this.restClient.performRequest(request);
            System.out.println("Request done -> " + response.getStatusLine());
        }
    }
}
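To verify the import, the standard _count endpoint reports how many documents the index now holds:

GET http://{ip}:9200/haoke/house/_count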
Give it a try:
POST http://{ip}:9200/haoke/house/_search
{
    "query": {
        "match": {
            "title": {
                "query": "上海"
            }
        }
    },
    "highlight": {
        "fields": {
            "title": {}
        }
    }
}
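By default Elasticsearch wraps matched terms in <em> tags, so each hit in the response should carry a highlight block roughly like this (document values are illustrative):

{
    "_source": {
        "title": "整租·上海路 2室1厅 南"
    },
    "highlight": {
        "title": ["整租·<em>上海</em>路 2室1厅 南"]
    }
}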
With the data in ES, we now build the search service in the haoke dubbo API project. Add the Spring Data Elasticsearch starter:

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
Then configure the connection in application.properties:

# cluster-name must match the cluster_name shown at http://{ip}:9200 in your browser; a mismatch causes a "cluster not found" error
spring.data.elasticsearch.cluster-name={look it up via http://{ip}:9200}
# 9200 is the RESTful port; 9300 is the Java API (transport) port
spring.data.elasticsearch.cluster-nodes={ip}:9300
The document VO mapped to the haoke index:

package cn.itcast.haoke.dubbo.api.vo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;

@Data
@AllArgsConstructor
@NoArgsConstructor
@Document(indexName = "haoke", type = "house", createIndex = false)
public class HouseData {

    @Id
    private String id;
    private String title;
    private String rent;
    private String floor;
    private String image;
    private String orientation;
    private String houseType;
    private String rentMethod;
    private String time;
}
The search result VO:

package cn.itcast.haoke.dubbo.api.vo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;
import java.util.Set;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class SearchResult {

    private Integer totalPage;
    private List<HouseData> list;
    private Set<String> hotWord;

    public SearchResult(Integer totalPage, List<HouseData> list) {
        this.totalPage = totalPage;
        this.list = list;
    }
}
The search controller:

package cn.itcast.haoke.dubbo.api.controller;

import cn.itcast.haoke.dubbo.api.service.SearchService;
import cn.itcast.haoke.dubbo.api.vo.SearchResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.web.bind.annotation.*;

import java.util.Set;

@RequestMapping("search")
@RestController
@CrossOrigin
public class SearchController {

    private static final Logger LOGGER = LoggerFactory.getLogger(SearchController.class);

    @Autowired
    private SearchService searchService;

    @Autowired
    private RedisTemplate redisTemplate;

    @GetMapping
    public SearchResult search(@RequestParam("keyWord") String keyWord,
                               @RequestParam(value = "page", defaultValue = "1") Integer page) {
        // Guard against scrapers pulling too much data
        if (page > 100) {
            page = 1;
        }
        SearchResult search = this.searchService.search(keyWord, page);
        // Approximate total result count, for logging only
        Integer count = ((Math.max(search.getTotalPage(), 1) - 1) * SearchService.ROWS)
                + search.getList().size();
        // Log the search
        LOGGER.info("[Search] keyword = " + keyWord + ", result count = " + count);
        return search;
    }
}
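Note that RedisTemplate is injected but not used yet; it is reserved for the hot-word recommendation feature listed in the outline. As a sketch only (the key name HOT_WORD and both helper methods are assumptions, not code from the project), keyword frequencies could be tracked in a Redis sorted set inside SearchController:

// Hypothetical helpers for the hot-word feature; the key name "HOT_WORD" is an assumption.
private void recordHotWord(String keyWord) {
    // Each search bumps the keyword's score by 1 in a sorted set
    this.redisTemplate.opsForZSet().incrementScore("HOT_WORD", keyWord, 1);
}

@SuppressWarnings("unchecked")
private Set<String> topHotWords(int n) {
    // Return the n highest-scored keywords, most popular first
    return this.redisTemplate.opsForZSet().reverseRange("HOT_WORD", 0, n - 1);
}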
The search service:

package cn.itcast.haoke.dubbo.api.service;

import cn.itcast.haoke.dubbo.api.vo.HouseData;
import cn.itcast.haoke.dubbo.api.vo.SearchResult;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.elasticsearch.core.ElasticsearchTemplate;
import org.springframework.data.elasticsearch.core.aggregation.AggregatedPage;
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
import org.springframework.data.elasticsearch.core.query.SearchQuery;
import org.springframework.stereotype.Service;

@Service
public class SearchService {

    public static final Integer ROWS = 10;

    @Autowired
    private ElasticsearchTemplate elasticsearchTemplate;

    public SearchResult search(String keyWord, Integer page) {
        // Paging parameters (Spring Data pages are zero-based)
        PageRequest pageRequest = PageRequest.of(page - 1, ROWS);
        SearchQuery searchQuery = new NativeSearchQueryBuilder()
                // match query on title; AND requires every term to match
                .withQuery(QueryBuilders.matchQuery("title", keyWord).operator(Operator.AND))
                .withPageable(pageRequest)
                // ask ES to highlight the title field
                .withHighlightFields(new HighlightBuilder.Field("title"))
                .build();
        // Execute the query
        AggregatedPage<HouseData> housePage =
                this.elasticsearchTemplate.queryForPage(searchQuery, HouseData.class);
        return new SearchResult(housePage.getTotalPages(), housePage.getContent());
    }
}
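One caveat: withHighlightFields only asks Elasticsearch to compute the highlight fragments; the default mapper behind queryForPage does not copy them back into HouseData, so the returned titles are plain. A minimal sketch of a custom SearchResultMapper that swaps in the highlighted title, assuming Spring Data Elasticsearch 3.1.x (imports needed: org.elasticsearch.action.search.SearchResponse, org.elasticsearch.search.SearchHit, org.elasticsearch.search.fetch.subphase.highlight.HighlightField, org.springframework.data.domain.Pageable, org.springframework.data.elasticsearch.core.SearchResultMapper, org.springframework.data.elasticsearch.core.aggregation.impl.AggregatedPageImpl, java.util.*):

// Sketch: map hits manually so the <em>-wrapped title replaces the plain one.
AggregatedPage<HouseData> housePage = this.elasticsearchTemplate.queryForPage(
        searchQuery, HouseData.class, new SearchResultMapper() {
            @Override
            @SuppressWarnings("unchecked")
            public <T> AggregatedPage<T> mapResults(SearchResponse response, Class<T> clazz, Pageable pageable) {
                List<HouseData> content = new ArrayList<>();
                for (SearchHit hit : response.getHits()) {
                    Map<String, Object> source = hit.getSourceAsMap();
                    HouseData house = new HouseData();
                    house.setId(hit.getId());
                    house.setTitle((String) source.get("title"));
                    house.setRent((String) source.get("rent"));
                    // ...copy floor, image, orientation, houseType, rentMethod, time the same way...
                    HighlightField titleHighlight = hit.getHighlightFields().get("title");
                    if (titleHighlight != null) {
                        // Replace the plain title with the highlighted fragment
                        house.setTitle(titleHighlight.fragments()[0].toString());
                    }
                    content.add(house);
                }
                return (AggregatedPage<T>) new AggregatedPageImpl<>(content, pageable,
                        response.getHits().getTotalHits());
            }
        });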
Start the application and you will hit an error: with Redis also on the classpath, the ES transport client and the Redis client clash over Netty's availableProcessors setting. The fix is as follows:
package cn.itcast.haoke.dubbo.api;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class DubboApiApplication {

    public static void main(String[] args) {
        // Tell the ES client not to set Netty's availableProcessors,
        // avoiding the conflict with the Netty instance Redis already initialized
        System.setProperty("es.set.netty.runtime.available.processors", "false");
        SpringApplication.run(DubboApiApplication.class, args);
    }
}
Test:
http://127.0.0.1:18080/search?keyWord=上海&page=2
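The response is the serialized SearchResult and should look roughly like this (values are illustrative; hotWord stays null until that feature is implemented):

{
    "totalPage": 5,
    "list": [
        {
            "id": "x9Bd2GsBzTZZb9Gkl5dV",
            "title": "整租·上海路 2室1厅 南",
            "rent": "5000",
            "floor": "中楼层/6层",
            "image": "SH000000.jpg",
            "orientation": "南",
            "houseType": "2室1厅1卫",
            "rentMethod": "整租",
            "time": "随时看房"
        }
    ],
    "hotWord": null
}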