package wallstreetcnsave;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.Mongo;
public class WallstreetcnSaveTest implements Runnable {
private static String DataBaseName = "textclassify";
private static String CollectionName = "WallstreetSaveJava";
private static String url = "http://api.wallstreetcn.com/v2/livenews?&page=";
private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"
(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?";
private static final String REGEXSTRING1 = "type";
private static final String REGEXSTRING2 = "content";
private static final String REGEXSTRING3 = "categoryset";
//map表的存放
public static Map GetMap() {
Map map = new HashMap();
map.put("1", "外汇");
map.put("2", "股市");
map.put("3", "商品");
map.put("4", "债市");
map.put("9", "中国");
map.put("10", "美国");
map.put("11", "欧元区");
map.put("12", "日本");
map.put("13", "英国");
map.put("14", "澳洲");
map.put("15", "加拿大");
map.put("16", "瑞士");
map.put("17", "其他地区");
map.put("5", "央行");
return map;
}
private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" };
private static String[] ruleList_property = { "1", "2", "3", "4" };
private static String[] ruleList_centralbank = { "5" };
private static final int start = 1;
private static final int end = 3000;
//对x,x,x格式的内容进行分隔筛选
public static String setCategory(String categorySet, String[] ruleList, Map map) {
StringBuffer disStr = new StringBuffer();
String[] strArray = null;
strArray = categorySet.split(","); // 拆分字符为",",然后把结果交给数组strArray
// 获取需要的信息
int length_strArray = strArray.length;
int length_ruleList = ruleList.length;
if (length_strArray > 0) {
for (int iArr = 0; iArr < length_strArray; iArr++) {
String s = strArray[iArr];
for (int iRul=0; iRul < length_ruleList; iRul++) {
if (s.equals(ruleList[iRul])) {
disStr.append(map.get(s));
disStr.append(",");
break;
}
}
}
}
if(disStr.length()>1) {
disStr = disStr.deleteCharAt(disStr.length()-1);
}
return disStr.toString();
}
//读取整个页面,返回html字符串
private static String httpRequest(String requestUrl) {
StringBuffer buffer = null;
BufferedReader bufferedReader = null;
InputStreamReader inputStreamReader = null;
InputStream inputStream = null;
HttpURLConnection httpUrlConn = null;
try {
// 建立get请求
URL url = new URL(requestUrl);
httpUrlConn = (HttpURLConnection) url.openConnection();
httpUrlConn.setDoInput(true);
httpUrlConn.setRequestMethod("GET");
// 获取输入流
inputStream = httpUrlConn.getInputStream();
inputStreamReader = new InputStreamReader(inputStream, "UTF-8");
bufferedReader = new BufferedReader(inputStreamReader);
// 从输入流获取结果
buffer = new StringBuffer();
String str = null;
while ((str = bufferedReader.readLine()) != null) {
str = new String(str.getBytes(), "UTF-8");
buffer.append(str);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (bufferedReader != null) {
try {
bufferedReader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (inputStreamReader != null) {
try {
inputStreamReader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (httpUrlConn != null) {
httpUrlConn.disconnect();
}
}
return buffer.toString();
}
// 过滤掉无用的信息
public static List