初识java爬虫

通过 HttpClientUtil 工具类爬取网页,解析返回的 JSON 数据,再通过 IO 流将解析好的数据保存到本地文件

代码如下:

package com.ly.spider.http;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.impl.client.CloseableHttpClient;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Fetches paged JSON listings over HTTP, extracts a fixed set of columns from
 * every record and writes the result to a local tab-separated text file.
 *
 * Created by Eric on 2017/7/5.
 */
public class SpiderImpl {

    /** URL every page request is POSTed to. */
    private static final String LIST_URL = "http://www.wdzj.com/front_select-plat";

    /** First and last value (both inclusive) sent as the {@code currPage} parameter. */
    private static final int FIRST_PAGE = 1;
    private static final int LAST_PAGE = 190;

    /** Keys read from each record, in output column order. */
    private static final String[] COLUMN_KEYS = {
        "platName", "cityName", "term", "serviceAttitude"
    };

    /** Destination file and the charset name used to encode it. */
    private static final String OUTPUT_PATH = "F:\\Test\\tests.txt";
    private static final String OUTPUT_ENCODING = "utf-8";

    public static void main(String[] args) throws Exception {
        SpiderImpl spider = new SpiderImpl();
        spider.checkStatus();
    }

    /**
     * Requests pages {@link #FIRST_PAGE}..{@link #LAST_PAGE}, appends one line per
     * record (the {@link #COLUMN_KEYS} values, each followed by a tab) and hands
     * the accumulated text to {@link IOUtil#writeFile}.
     *
     * @return always {@code null}; the result is delivered through the output file
     * @throws Exception whatever the HTTP helper or the JSON parser throws
     */
    public String checkStatus() throws Exception {
        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Host", "www.wdzj.com");
        headerMap.put("Connection", "keep-alive");
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36");
        headerMap.put("Accept-Encoding", "gzip, deflate, sdch");
        headerMap.put("Accept-Language", "zh-CN,zh;q=0.8");
        headerMap.put("Accept", "application/json, text/javascript, */*; q=0.01");

        // NOTE(review): the client is not closed here because it is unclear from this
        // file whether getDefaultHttpClient() returns a shared instance - confirm.
        CloseableHttpClient httpClient = HttpClientUtil.getDefaultHttpClient();

        // Only currPage changes between requests, so the other parameters are set once.
        Map<String, Object> paramMap = new HashMap<String, Object>();
        paramMap.put("params", "");
        paramMap.put("sort", "0");

        StringBuilder stringBuilder = new StringBuilder();
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            paramMap.put("currPage", page);
            HttpContext context = HttpClientUtil.doPost(httpClient, LIST_URL, paramMap, headerMap);
            String responseContent = HttpClientUtil.getResponseContent(context);
            appendRows(stringBuilder, responseContent);
        }

        IOUtil.writeFile(OUTPUT_PATH, stringBuilder.toString(), OUTPUT_ENCODING);
        System.out.println("success");
        return null;
    }

    /**
     * Parses one response body and appends a line per element of its {@code "list"} array.
     *
     * Each record is emitted exactly once. The previous implementation wrapped the
     * record loop in a second loop over the same array, which repeated every row
     * of a page as many times as the page had rows.
     *
     * @param out             buffer receiving the tab-separated lines
     * @param responseContent raw JSON text of a single page; a null/empty body or a
     *                        body without a {@code "list"} array contributes nothing
     */
    private static void appendRows(StringBuilder out, String responseContent) {
        JSONObject page = JSONObject.parseObject(responseContent);
        JSONArray rows = (page == null) ? null : page.getJSONArray("list");
        if (rows == null) {
            return;
        }
        for (int i = 0; i < rows.size(); i++) {
            JSONObject item = rows.getJSONObject(i);
            for (String column : COLUMN_KEYS) {
                // A missing key is written as the text "null", as before.
                out.append(item.get(column)).append('\t');
            }
            out.append('\n');
        }
    }
}

IOUtil 工具类,用于将解析好的数据保存到本地

package com.ly.spider.http;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * File I/O helper.
 *
 * @author zel
 */
public class IOUtil {
    /**
     * Writes {@code value} to the file at {@code filePath}, replacing any existing
     * content.
     *
     * The text is encoded before the file is opened. Opening a
     * {@link FileOutputStream} truncates the target, so encoding first means an
     * unsupported {@code encoding} or a {@code null} {@code value} no longer wipes
     * an existing file.
     *
     * This method is best-effort: any failure is printed to standard error and the
     * method returns normally.
     *
     * @param filePath path of the file to create or overwrite
     * @param value    text to write
     * @param encoding charset name used to encode {@code value}, e.g. {@code "utf-8"}
     */
    public static void writeFile(String filePath, String value, String encoding) {
        try {
            byte[] payload = value.getBytes(encoding);
            // try-with-resources closes the stream exactly once on every path.
            try (FileOutputStream fos = new FileOutputStream(new File(filePath))) {
                fos.write(payload);
            }
        } catch (Exception e) {
            // Callers do not expect an exception from this helper; report and continue.
            e.printStackTrace();
        }
    }

    /** Small manual check: writes a sample string to {@code test.txt}. */
    public static void main(String[] args) {
        String filePath = "test.txt";
        String value = "hello world,123";
        String encoding = "utf-8";

        IOUtil.writeFile(filePath, value, encoding);

        System.out.println("done!");
    }
}


  转载请注明: Hi 高虎 初识java爬虫

 上一篇
提取多层嵌套JSON类型数据 提取多层嵌套JSON类型数据
提取多层嵌套JSON类型数据,解析多层嵌套数据,JSON与对象相互转换,JSON序列化以及反序列化操作。 数据实例: { "error": 0, "status": "su
2017-07-22
下一篇 
HttpClientUtil工具类 HttpClientUtil工具类
HttpClientUtil工具类,用于进行Post、Get请求,HttpClient 是Apache Jakarta Common 下的子项目,可以用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HT
  目录