Web Crawlers


Breaking Down the Process

A crawler boils down to three stages: request, filter (extract), and store.
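To make the split concrete, here is a minimal end-to-end sketch, assuming the httpclient and jsoup dependencies introduced below, with imports omitted like the other snippets here; the target URL and the output file titles.txt are placeholders:

    public static void main(String[] args) throws Exception {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://java.ffffffff0x.com/api"); // 1. request
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            String title = Jsoup.parse(html).title();                      // 2. filter/extract
            Files.write(Paths.get("titles.txt"), (title + "\n").getBytes(),
                    StandardOpenOption.CREATE, StandardOpenOption.APPEND); // 3. store
        } finally {
            client.close();
        }
    }

Each stage is covered in detail below.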


Making Requests

Maven dependency

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.12</version>
        </dependency>

GET request

    public static void main(String[] args) {
        CloseableHttpClient client = HttpClients.createDefault(); // create the HttpClient instance
        HttpGet httpGet = new HttpGet("https://java.ffffffff0x.com/api"); // create the GET request object
        CloseableHttpResponse response = null;
        try {
            response = client.execute(httpGet); // send the GET request
            if (response.getStatusLine().getStatusCode() == 200) {
                String s = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(s);
                System.out.println(httpGet);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (response != null) {
                    response.close(); // guard against an NPE when execute() failed
                }
                client.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

GET request with parameters

    public static void main(String[] args) throws URISyntaxException {
        CloseableHttpClient client = HttpClients.createDefault(); // create the HttpClient instance
        URIBuilder uriBuilder = new URIBuilder("https://java.ffffffff0x.com/api"); // build the address with URIBuilder
        uriBuilder.setParameter("page", "2"); // set the query parameter
        HttpGet httpGet = new HttpGet(uriBuilder.build()); // create the GET request object
        // effective URL: https://java.ffffffff0x.com/api?page=2
        CloseableHttpResponse response = null;
        try {
            response = client.execute(httpGet); // send the GET request
            if (response.getStatusLine().getStatusCode() == 200) {
                String s = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(s);
                System.out.println(httpGet);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
                client.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

POST request

    public static void main(String[] args) {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpPost httpPost = new HttpPost("https://java.ffffffff0x.com/api");
        CloseableHttpResponse response = null;
        try {
            response = client.execute(httpPost); // send the POST request
            String s = EntityUtils.toString(response.getEntity());
            System.out.println(s);
            System.out.println(httpPost);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

For requests that carry no parameters, the GET and POST flows are essentially identical. The only real difference is the request object: a GET is built with HttpGet, a POST with HttpPost. Beyond construction, both behave the same, as the sketch below shows.
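Both classes extend the same HttpRequestBase, so for example setting a User-Agent header (a common need when crawling) is the same call on either object; the UA string here is just an illustrative value:

    HttpGet httpGet = new HttpGet("https://java.ffffffff0x.com/api");
    httpGet.setHeader("User-Agent", "Mozilla/5.0");   // illustrative UA string

    HttpPost httpPost = new HttpPost("https://java.ffffffff0x.com/api");
    httpPost.setHeader("User-Agent", "Mozilla/5.0");  // same call, same effect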

POST request with parameters

    public static void main(String[] args) throws IOException {
        CloseableHttpClient client = HttpClients.createDefault(); // create the HttpClient instance
        HttpPost httpPost = new HttpPost("http://java.ffffffff0x.com/api"); // create the POST request object
        List<NameValuePair> params = new ArrayList<NameValuePair>(); // declare a list to hold the parameters
        params.add(new BasicNameValuePair("page", "3"));
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8"); // build a form entity from params
        httpPost.setEntity(formEntity); // attach the form to the POST body
        CloseableHttpResponse response = client.execute(httpPost);
        String s = EntityUtils.toString(response.getEntity());
        System.out.println(s);
        System.out.println(s.length());
        System.out.println(httpPost);
    }

Using a proxy

        // Proxy IP, port, and scheme (replace with your own)
        HttpHost proxy = new HttpHost("127.0.0.1", 1080, "http");

        // Put the proxy into the request config
        RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setProxy(proxy)
                .build();

        // Build a CloseableHttpClient with that config as the default
        CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(defaultRequestConfig).build();

        // Target address
        HttpGet httpGet = new HttpGet("https://www.google.com");

        // Send the request
        CloseableHttpResponse httpResp = httpclient.execute(httpGet);
        try {
            int statusCode = httpResp.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                System.out.println("success");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            httpResp.close();
        }
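If only some requests should go through the proxy, the same RequestConfig can be attached to a single request instead of the whole client; a minimal sketch, reusing the proxy object from above:

        // Per-request proxy: only this request is routed through it
        HttpGet proxiedGet = new HttpGet("https://www.google.com");
        proxiedGet.setConfig(RequestConfig.custom().setProxy(proxy).build());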

Connection pooling

Creating a fresh HttpClient for every request means constantly constructing and tearing it down; a connection pool avoids that overhead.

Create a connection pool object:

PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

public void setMaxTotal(int max)
        sets the maximum total number of connections in the pool

public void setDefaultMaxPerRoute(int max)
        sets the maximum number of concurrent connections per host

Connection pool example

    public static void main(String[] args) {
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(100); // maximum total connections
        cm.setDefaultMaxPerRoute(100); // maximum concurrent connections per host
        doGet(cm);
        doGet(cm);
    }

    private static void doGet(PoolingHttpClientConnectionManager cm) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.baidu.com");
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // consume the body so the connection can go back to the pool
            String s = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(s.length());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
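Note that doGet closes the response (which just returns the connection to the pool) but never calls httpClient.close(): in HttpClient 4.x, closing a client built around an externally supplied connection manager shuts that manager down unless setConnectionManagerShared(true) was set, which would defeat the point of sharing cm across calls.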

HttpClient request configuration

    public static void main(String[] args) throws IOException {
        CloseableHttpClient client = HttpClients.createDefault(); // create the HttpClient instance
        HttpGet httpGet = new HttpGet("http://www.baidu.com"); // create the GET request object
        RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool
                .setSocketTimeout(500).build(); // max time between data packets

        httpGet.setConfig(config);
        CloseableHttpResponse response = client.execute(httpGet);
        String s = EntityUtils.toString(response.getEntity());
        System.out.println(s);
    }

Extraction

jsoup

jsoup is a Java HTML parser that can parse HTML directly from a URL or from raw HTML text. It provides a very convenient API for extracting and manipulating data via DOM traversal, CSS selectors, and jQuery-like methods.

jsoup's main features:

  • parse HTML from a URL, file, or string;

  • find and extract data using DOM traversal or CSS selectors (a select() sketch appears after the title example below);

  • manipulate HTML elements, attributes, and text;

Maven dependency

        <dependency>
            <!-- jsoup HTML parser library @ http://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>

Here is a snippet that scrapes a forum's title:

    @Test
    public void testUrl() throws Exception {
        Document doc = Jsoup.parse(new URL("https://home.ffffffff0x.com/"), 10000); // request the URL with a timeout
        String title = doc.getElementsByTag("title").first().text(); // read the content of the title tag
        System.out.println(title);
    }

Here first() returns the first matching element and text() returns the element's text content.
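jsoup can also parse HTML handed to it as a string (for example, a body already fetched with HttpClient) and query it with CSS selectors via select(); a quick sketch over made-up markup:

    @Test
    public void testSelect() {
        String html = "<div class='post'><a href='/p/1'>first</a><a href='/p/2'>second</a></div>";
        Document doc = Jsoup.parse(html);            // parse from a string instead of a URL
        for (Element a : doc.select("div.post a")) { // CSS selector, like jQuery's $()
            System.out.println(a.attr("href") + " -> " + a.text());
        }
    }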

Traversing elements with the DOM

    @Test
    public void TestDom() throws IOException {
        Document doc = Jsoup.parse(new URL("https://ffffffff0x.com/"), 10000);
        String title = doc.getElementsByTag("title").text();
        String h1 = doc.getElementsByTag("h1").text();

        String ex2 = doc.getElementsByClass("ex2").first().text();
        System.out.println("title : "+title);
        System.out.println("h1 : "+h1);
        System.out.println("ex2 : "+ex2);
    }

Scraping an article

    @Test
    public void TestDom() throws IOException {
        String url = "https://www.freebuf.com/articles/network/274294.html";

        Document doc = Jsoup.parse(new URL(url), 10000);
        String title = doc.getElementsByTag("title").text();
        String time = doc.getElementsByClass("author-info").text();
        String artical = doc.getElementsByClass("artical-body").text();

        System.out.println("title : "+title);
        System.out.println("time : "+time);
        System.out.println("artical : "+artical);
    }

Multithreaded crawling

Crawler class

public class Climbimp implements Runnable {
    private String url;

    Lock lock = new ReentrantLock();

    public Climbimp(String url) {
        this.url = url;
    }

    public void run() {
        lock.lock(); // note: this per-instance lock serializes all threads sharing the instance
        try {
            Document doc;
            try {
                doc = Jsoup.parse(new URL(url), 10000);
            } catch (IOException e) {
                e.printStackTrace();
                return; // bail out instead of dereferencing a null doc
            }
            String title = doc.getElementsByTag("title").text();
            String time = doc.getElementsByClass("author-info").text();
            String artical = doc.getElementsByClass("artical-body").text();

            System.out.println("title : " + title);
            //System.out.println("time : " + time);
            //System.out.println("artical : " + artical);
        } finally {
            lock.unlock();
        }
    }
}

Main class

public class main {

    public static void main(String[] args) {

        int Threadlist_num = 50; // number of threads
        String url = "https://www.freebuf.com/articles/network/274294.html"; // target url

        Climbimp climbimpl = new Climbimp(url);

        for (int i = 0; i < Threadlist_num; i++) {
            new Thread(climbimpl).start();
        }
    }
}

Running this fires 50 requests at https://www.freebuf.com/articles/network/274294.html in quick succession, after which freebuf bans your IP 😂
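A friendlier pattern is to bound the concurrency and space the requests out. A minimal sketch, reusing the Climbimp class above; the pool size of 2 and the one-second pause are arbitrary values to tune against the target site:

public class ThrottledMain {

    public static void main(String[] args) throws InterruptedException {
        String url = "https://www.freebuf.com/articles/network/274294.html";
        ExecutorService pool = Executors.newFixedThreadPool(2); // at most 2 fetches in flight

        for (int i = 0; i < 10; i++) {
            pool.submit(new Climbimp(url));
            Thread.sleep(1000); // pause between submissions to stay under the rate limit
        }
        pool.shutdown(); // finish queued tasks, accept no new ones
    }
}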


Source & Reference

  • https://www.cnblogs.com/nice0e3/p/13488064.html

  • https://blog.csdn.net/ly6cyh/article/details/77141346