博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
使用JSoup+CSSPath采集和讯网人物信息
阅读量:6584 次
发布时间:2019-06-24

本文共 5857 字,大约阅读时间需要 19 分钟。

hot3.png

使用JSoup+CSSPath采集和讯网人物信息

 

代码见下文:

 

模型类:

 

/**
 * Mutable data holder describing one person: a name, a map of basic
 * attributes, and three lists of free-text entries (education, jobs,
 * important events).
 *
 * <p>Getters and setters store and return the supplied references as-is;
 * no defensive copies are made. Every property is {@code null} until its
 * setter has been called.
 */
public class Person {
    private String name;
    // Basic information (attribute label -> attribute value)
    private Map<String, String> basicInfos;
    // NOTE(review): the three lists below are package-private while the other
    // fields are private; visibility is left untouched so that any
    // same-package code reading them directly keeps compiling.
    // Education history
    List<String> educations;
    // Work experience
    List<String> jobs;
    // Important events
    List<String> importants;

    /** Returns the person's name, or {@code null} if none has been set. */
    public String getName() {
        return name;
    }

    /** Sets the person's name. */
    public void setName(String name) {
        this.name = name;
    }

    /** Returns the basic-information map, or {@code null} if none has been set. */
    public Map<String, String> getBasicInfos() {
        return basicInfos;
    }

    /** Sets the basic-information map (stored by reference). */
    public void setBasicInfos(Map<String, String> basicInfos) {
        this.basicInfos = basicInfos;
    }

    /** Returns the education entries, or {@code null} if none have been set. */
    public List<String> getEducations() {
        return educations;
    }

    /** Sets the education entries (stored by reference). */
    public void setEducations(List<String> educations) {
        this.educations = educations;
    }

    /** Returns the work-experience entries, or {@code null} if none have been set. */
    public List<String> getJobs() {
        return jobs;
    }

    /** Sets the work-experience entries (stored by reference). */
    public void setJobs(List<String> jobs) {
        this.jobs = jobs;
    }

    /** Returns the important-event entries, or {@code null} if none have been set. */
    public List<String> getImportants() {
        return importants;
    }

    /** Sets the important-event entries (stored by reference). */
    public void setImportants(List<String> importants) {
        this.importants = importants;
    }
}

 

 

 

采集器:

 

package org.apdplat.demo.collect;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.Map.Entry;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import org.slf4j.Logger;import org.slf4j.LoggerFactory;public class PersonCollector{    private static final Logger LOG = LoggerFactory.getLogger(PersonCollector.class);    private static final int PAGES = 298;    public List
 collect() {        List
 persons = new ArrayList<>();        try {            String url = "http://renwu.hexun.com/search.aspx?z=All&Filter=All&page=";            //共298页            for(int i=1; i
 basicInfos = new HashMap<>();                        for(Element basicElement : basicElements){                            String info = basicElement.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("・").text(), "·");                            if(info != null){                                String[] attrs = info.split(":");                                if(attrs != null && attrs.length == 2){                                    basicInfos.put(attrs[0], attrs[1]);                                }                            }                        }                        String moreCSSQuery = "html body div.wrap div.mainBox div.main div.contBox";                        LOG.debug("moreCSSQuery: " + moreCSSQuery);                        Elements moreElements = document.select(moreCSSQuery);                        //教育经历                        List
 educations = new ArrayList<>();                        Elements educationElements = moreElements.get(0).select("div.cont p");                        for(Element educationElement : educationElements){                            String education = educationElement.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("・").text(), "·");                            if(education != null && !"".equals(education.trim())){                                educations.add(education);                            }                        }                                                //工作经历                        List
 jobs = new ArrayList<>();                        Elements jobElements = moreElements.get(1).select("div.cont p");                        for(Element jobElement : jobElements){                            String job = jobElement.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("・").text(), "·");                            if(job != null && !"".equals(job.trim())){                                jobs.add(job);                            }                        }                                                //重要事件                        List
 importants = new ArrayList<>();                        Elements importantElements = moreElements.get(4).select("div.cont p");                        for(Element importantElement : importantElements){                            String important = importantElement.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("・").text(), "·");                            if(important != null && !"".equals(important.trim())){                                importants.add(important);                            }                        }                        Person person = new Person();                        person.setName(personName);                        person.setBasicInfos(basicInfos);                        person.setEducations(educations);                        person.setJobs(jobs);                        person.setImportants(importants);                        persons.add(person);                    }catch(IOException e){                        LOG.error("采集出错",e);                    }                }            }                    } catch (IOException ex) {            LOG.error("采集出错",ex);        }        return persons;    }    public static void main(String[] args) {        PersonCollector personCollector = new PersonCollector();        List
 persons = personCollector.collect();        if (persons != null) {            int i = 1;            for (Person person : persons) {                LOG.info("采集结果 " + (i++) + " "+person.getName()+ " :");                                if(person.getBasicInfos() != null && person.getBasicInfos().size() > 0){                            LOG.info("基本信息************************************************************");                    for(Entry
 basicInfo : person.getBasicInfos().entrySet()){                        LOG.info(basicInfo.getKey() +":" + basicInfo.getValue());                    }                }                if(person.getEducations() != null && person.getEducations().size() > 0){                                        LOG.info("");                    LOG.info("教育经历************************************************************");                    for(String education : person.getEducations()){                        LOG.info(education);                    }                }                if(person.getJobs() != null && person.getJobs().size() > 0){                    LOG.info("");                    LOG.info("工作经历************************************************************");                    for(String job : person.getJobs()){                        LOG.info(job);                    }                }                if(person.getImportants() != null && person.getImportants().size() > 0){                    LOG.info("");                    LOG.info("重要事件************************************************************");                    for(String important : person.getImportants()){                        LOG.info(important.replace("\\?", " "));                    }                }                LOG.info("");                LOG.info("");            }        } else {            LOG.error("没有采集到结果");        }    }}

 

 

 

 

转载于:https://my.oschina.net/apdplat/blog/397143

你可能感兴趣的文章
面试-1
查看>>
第一章,重点总结
查看>>
LeetCode - 49. Group Anagrams
查看>>
移动前端不得不了解的html5 head 头标签
查看>>
Tomcat 服务器性能优化
查看>>
【框架学习】ibatis DAO框架分析
查看>>
ZOJ 3640 Help Me Escape
查看>>
C#下实现的半角转与全角的互转
查看>>
PreparedStatement vs Statement
查看>>
使用texturePaker批量转化pvr为pn
查看>>
截取指定网站Html编码
查看>>
作业一 统计软件简介与数据操作
查看>>
css布局
查看>>
HBase-java api 基本操作
查看>>
POJ2229 Sumsets
查看>>
在LINQ-TO-SQL中实现“级联删除”的方法
查看>>
lemur run PLSA
查看>>
HTTP中的header头解析说明
查看>>
删除windows中的库、家庭组、收藏夹
查看>>
war 宽度变窄
查看>>