From 568e0530dbf30e11b463acc728eb82c0072b138a Mon Sep 17 00:00:00 2001 From: tongtzeho Date: Sun, 1 Jan 2017 11:48:08 +0800 Subject: [PATCH] Add files via upload --- Java_Programming/src/crawler/Crawler.java | 372 +++++++++++++++ .../src/crawler/DownloadPage.java | 435 ++++++++++++++++++ Java_Programming/src/crawler/Parser.java | 152 ++++++ .../src/crawler/SameFileName.java | 42 ++ .../src/crawler/TxtFileFilter.java | 34 ++ Java_Programming/src/crawler/Work.java | 201 ++++++++ 6 files changed, 1236 insertions(+) create mode 100644 Java_Programming/src/crawler/Crawler.java create mode 100644 Java_Programming/src/crawler/DownloadPage.java create mode 100644 Java_Programming/src/crawler/Parser.java create mode 100644 Java_Programming/src/crawler/SameFileName.java create mode 100644 Java_Programming/src/crawler/TxtFileFilter.java create mode 100644 Java_Programming/src/crawler/Work.java diff --git a/Java_Programming/src/crawler/Crawler.java b/Java_Programming/src/crawler/Crawler.java new file mode 100644 index 0000000..426ce74 --- /dev/null +++ b/Java_Programming/src/crawler/Crawler.java @@ -0,0 +1,372 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package crawler; + +import java.awt.*; +import java.awt.event.*; +import java.io.*; +import javax.swing.*; + +/** + * 程序名:Crawler + * 作者:北大信科计算机系11级唐子豪(1100012773)、北大信科计算机系11级骆宇冲(1100012778) + * 编译环境:Microsoft Windows 7(64-bit)下的NetBeans IDE 7.3 + * 源文件:Crawler.java, DownloadPage.java, Parser.java, SameFileName.java, TxtFileFilter.java, Work.java + * 功能: + * 1.多线程地连接互联网,获取页面源代码,在工作中随时可以停止或退出 + * 2.通过正则表达式匹配,根据用户的选择可提取URL、电子邮箱、QQ号码、日期、电驴链接等信息 + * 3.用户可自定义正则表达式,从页面源代码或在正文中提取信息 + * 4.用户可自定义URL的正则表达式,当页面含有匹配的URL时,继续连接并提取信息 + * 5.允许下载网页或URL指向的文件(如exe、mp3等) + * 6.获取网页正文(去掉源代码中的html标签和js脚本等) + * 7.下载并保存网页中的图片或网页中含有的自定义格式的文件 + * 8.通过设置代理服务器连接互联网 + * 9.请求网页时发送给定的Cookie + */ + +/** + * 主类:Crawler + * 功能:画界面,捕捉按键 + */ +public class Crawler extends JApplet { + + static JFrame frame; // 界面 + static JTextField inputurljtf, importtxtjtf, proxyaddrjtf, proxyportjtf; // 输入网址、导入文件、代理地址、代理端口的输入框 + static JTextField userdefsourcejtf, userdeftextjtf, continueurljtf, saveformatjtf, cookiejtf; // 自定义正则表达式(两个)、继续搜索的网址、存储格式、Cookie的输入框 + static ButtonGroup bgrp; // 两个单选框的组 + static JRadioButton inputurljrb, importtxtjrb; // 输入网址和导入文件的单选框 + static JButton choosefilejb, startjb, stopjb; // 浏览文件、开始、停止按钮 + static JCheckBox arrjcb[], useproxyjcb, sendcookiejcb; // 18个功能选项、是否使用代理、是否发送Cookie的复选框 + static JLabel proxyaddrjlb, proxyportjlb; // "地址"和"端口"提示标语 + static Work work = null; // 启动的工作 + + @Override + public void init() { + frame = new JFrame("Crawler"); + frame.setSize(406, 446); // 窗体大小 + frame.setLocation(350, 130); // 窗体初始位置 + frame.setResizable(false); // 不可以改变大小 + + /* 点击窗口右上角的×时退出程序 */ + frame.addWindowListener(new WindowAdapter() { + @Override + public void windowClosing(WindowEvent arg0) { + deletetempfile(); + System.out.println("退出"); + System.exit(0); + } + }); + + /* 设置输入网址的输入框 */ + inputurljtf=new JTextField("", 8192); + inputurljtf.setSize(286, 20); + inputurljtf.setLocation(110, 10); + inputurljtf.setBackground(Color.WHITE); + frame.add(inputurljtf); + + /* 设置导入文件的输入框 */ + importtxtjtf=new JTextField("", 8192); + importtxtjtf.setSize(205, 20); + importtxtjtf.setLocation(110, 40); + importtxtjtf.setBackground(Color.WHITE); + frame.add(importtxtjtf); + + /* 设置两个单选框和组 */ + bgrp = new ButtonGroup(); + inputurljrb = new JRadioButton("输入网页地址 ", true); + importtxtjrb = new JRadioButton("从txt导入网址", false); + bgrp.add(inputurljrb); + bgrp.add(importtxtjrb); + inputurljrb.setSize(110, 20); + importtxtjrb.setSize(110, 20); + inputurljrb.setLocation(0, 10); + importtxtjrb.setLocation(0, 40); + frame.add(inputurljrb); + frame.add(importtxtjrb); + + /* 设置浏览按钮 */ + choosefilejb = new JButton("浏览..."); + choosefilejb.setSize(75, 20); + choosefilejb.setLocation(320, 40); + frame.add(choosefilejb); + + arrjcb = new JCheckBox[18]; + + /* 设置提取URL的复选框 */ + arrjcb[0] = new JCheckBox("提取URL"); + arrjcb[0].setSize(92, 20); + arrjcb[0].setLocation(0, 70); + + /* 设置提取电子邮箱地址的复选框 */ + arrjcb[1] = new JCheckBox("提取电子邮箱地址"); + arrjcb[1].setSize(132, 20); + arrjcb[1].setLocation(135,70); + + /* 设置提取ip地址的复选框 */ + arrjcb[2] = new JCheckBox("提取ip地址"); + arrjcb[2].setSize(98, 20); + arrjcb[2].setLocation(270, 70); + + /* 设置提取手机号码的复选框 */ + arrjcb[3] = new JCheckBox("提取手机号码"); + arrjcb[3].setSize(113, 20); + arrjcb[3].setLocation(0, 95); + + /* 设置提取电话号码的复选框 */ + arrjcb[4] = new JCheckBox("提取电话号码"); + arrjcb[4].setSize(113, 20); + arrjcb[4].setLocation(135, 95); + + /* 设置提取QQ号码的复选框 */ + arrjcb[5] = new JCheckBox("提取QQ号码"); + arrjcb[5].setSize(99, 20); + arrjcb[5].setLocation(270, 95); + + /* 设置提取身份证号码的复选框 */ + arrjcb[6] = new JCheckBox("提取身份证号码"); + arrjcb[6].setSize(119, 20); + arrjcb[6].setLocation(0, 120); + + /* 设置提取日期的复选框 */ + arrjcb[7] = new JCheckBox("提取日期"); + arrjcb[7].setSize(93, 20); + arrjcb[7].setLocation(135, 120); + + /* 设置提取时间的复选框 */ + arrjcb[8] = new JCheckBox("提取时间"); + arrjcb[8].setSize(93, 20); + arrjcb[8].setLocation(270, 120); + + /* 设置提取电驴链接的复选框 */ + arrjcb[9] = new JCheckBox("提取电驴链接(ed2k://...)"); + arrjcb[9].setSize(176, 20); + arrjcb[9].setLocation(0, 145); + + /* 设置提取迅雷链接的复选框 */ + arrjcb[10] = new JCheckBox("提取迅雷链接(thunder://...)"); + arrjcb[10].setSize(184, 20); + arrjcb[10].setLocation(186, 145); + + /* 设置从源代码提取自定义正则表达式内容的复选框和对应的输入框 */ + arrjcb[11] = new JCheckBox("从源代码提取以下内容"); + arrjcb[11].setSize(160, 20); + arrjcb[11].setLocation(0, 175); + userdefsourcejtf = new JTextField("", 8192); + userdefsourcejtf.setSize(234, 20); + userdefsourcejtf.setLocation(162, 175); + frame.add(userdefsourcejtf); + + /* 设置从正文提取自定义正则表达式内容的复选框和对应的输入框 */ + arrjcb[12] = new JCheckBox("在正文中提取以下内容"); + arrjcb[12].setSize(160, 20); + arrjcb[12].setLocation(0, 205); + userdeftextjtf = new JTextField("", 8192); + userdeftextjtf.setSize(234, 20); + userdeftextjtf.setLocation(162, 205); + frame.add(userdeftextjtf); + + /* 设置继续爬网页的复选框和对应的输入框 */ + arrjcb[13] = new JCheckBox("继续爬以下网页"); + arrjcb[13].setSize(119, 20); + arrjcb[13].setLocation(0, 235); + continueurljtf = new JTextField("", 8192); + continueurljtf.setSize(275, 20); + continueurljtf.setLocation(121, 235); + frame.add(continueurljtf); + + /* 设置保存目标的复选框 */ + arrjcb[14] = new JCheckBox("保存目标"); + arrjcb[14].setSize(93, 20); + arrjcb[14].setLocation(0, 265); + + /* 设置保存网页正文的复选框 */ + arrjcb[15] = new JCheckBox("保存网页正文"); + arrjcb[15].setSize(113, 20); + arrjcb[15].setLocation(135, 265); + + /* 设置下载网页图片的复选框 */ + arrjcb[16] = new JCheckBox("下载网页图片"); + arrjcb[16].setSize(113, 20); + arrjcb[16].setLocation(270, 265); + + /* 设置下载指定格式文件的复选框和对应的输入框 */ + arrjcb[17] = new JCheckBox("下载以下格式的文件"); + arrjcb[17].setSize(142, 20); + arrjcb[17].setLocation(0, 295); + saveformatjtf = new JTextField("", 8192); + saveformatjtf.setSize(252, 20); + saveformatjtf.setLocation(144, 295); + frame.add(saveformatjtf); + + /* 设置以上复选框的字体并显示 */ + int i; + for(i = 0; i <= 17; i++) { + arrjcb[i].setFont(new Font(Font.DIALOG, Font.PLAIN, 13)); + frame.add(arrjcb[i]); + } + + /* 设置使用代理服务器的复选框 */ + useproxyjcb = new JCheckBox("使用代理服务器"); + useproxyjcb.setSize(119, 20); + useproxyjcb.setLocation(0, 325); + useproxyjcb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13)); + frame.add(useproxyjcb); + + /* 设置代理地址的提示 */ + proxyaddrjlb = new JLabel("地址:"); + proxyaddrjlb.setSize(33, 20); + proxyaddrjlb.setLocation(139, 325); + proxyaddrjlb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13)); + frame.add(proxyaddrjlb); + + /* 设置代理地址的输入框 */ + proxyaddrjtf = new JTextField("", 256); + proxyaddrjtf.setSize(108, 20); + proxyaddrjtf.setLocation(173, 325); + frame.add(proxyaddrjtf); + + /* 设置代理端口的提示 */ + proxyportjlb = new JLabel("端口:"); + proxyportjlb.setSize(33, 20); + proxyportjlb.setLocation(306, 325); + proxyportjlb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13)); + frame.add(proxyportjlb); + + /* 设置代理端口的输入框 */ + proxyportjtf = new JTextField("", 16); + proxyportjtf.setSize(56, 20); + proxyportjtf.setLocation(340, 325); + frame.add(proxyportjtf); + + /* 设置发送Cookie的复选框 */ + sendcookiejcb = new JCheckBox("发送Cookie"); + sendcookiejcb.setSize(97, 20); + sendcookiejcb.setLocation(0, 355); + sendcookiejcb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13)); + frame.add(sendcookiejcb); + + /* 设置Cookie的输入框 */ + cookiejtf = new JTextField("", 8192); + cookiejtf.setSize(295, 20); + cookiejtf.setLocation(101, 355); + frame.add(cookiejtf); + + /* 设置开始按钮 */ + startjb = new JButton("开 始 (Enter)"); + startjb.setSize(190, 23); + startjb.setLocation(4, 386); + frame.add(startjb); + + /* 设置停止按钮 */ + stopjb = new JButton("停 止"); + stopjb.setSize(190, 23); + stopjb.setLocation(205, 386); + stopjb.setBackground(Color.LIGHT_GRAY); + frame.add(stopjb); + + frame.setLayout(null); // 取消默认布局管理器 + } + + @Override + public void start() { + + /* 点击"浏览"按钮时的响应 */ + ActionListener choosefileal = new ActionListener() { + @Override + public void actionPerformed(ActionEvent ae) { + /* 若已开始工作则不响应 */ + if (work == null || !work.isAlive()) { + importtxtjrb.setSelected(true); + + /* 弹出选择文件的对话框 */ + JFileChooser jfc = new JFileChooser ("."); + jfc.setAcceptAllFileFilterUsed(false); + jfc.addChoosableFileFilter(new TxtFileFilter()); + int result = jfc.showOpenDialog(null); + if(result == JFileChooser.APPROVE_OPTION) { + String path = jfc.getSelectedFile().getAbsolutePath(); + importtxtjtf.setText(path); + } + } + } + }; + choosefilejb.addActionListener(choosefileal); + + final Crawler crawler = this; + + /* 点击"开始"按钮时的响应 */ + ActionListener startal = new ActionListener() { + @Override + public void actionPerformed(ActionEvent ae) { + /* 若已开始工作则不响应 */ + if (work == null || !work.isAlive()) { + System.out.println("开始"); + work = new Work(); // 启动一项新工作 + work.start(); + } + } + }; + startjb.addActionListener(startal); + + /* 点击停止按钮时的响应 */ + ActionListener stopal = new ActionListener() { + @Override + public void actionPerformed(ActionEvent ae) { + /* 若当前没有工作则不响应 */ + if (work != null && work.isAlive()) { + work.needstop = true; // 依次将所有子线程的needstop设为true,使各子线程尽快终止 + System.out.println("准备停止..."); + deletetempfile(); // 删除临时文件 + while (work.isAlive()) {} + System.out.println("已停止"); + } + } + }; + stopjb.addActionListener(stopal); + + /* 在任意一个输入框按回车键视为点击"开始"按钮 */ + addenterlistener(inputurljtf); + addenterlistener(importtxtjtf); + addenterlistener(userdefsourcejtf); + addenterlistener(userdeftextjtf); + addenterlistener(continueurljtf); + addenterlistener(saveformatjtf); + addenterlistener(proxyaddrjtf); + addenterlistener(proxyportjtf); + addenterlistener(cookiejtf); + + frame.setVisible(true); // 将界面显示 + + } + + void addenterlistener(JTextField jtf) { + jtf.addKeyListener(new KeyAdapter() { + @Override + public void keyPressed(KeyEvent event) + { + if (event.getKeyText(event.getKeyCode()).compareToIgnoreCase("Enter")==0) { + startjb.doClick(); // 模拟点击"开始"按钮 + } + } + }); + } + + /* 删除所有临时文件 */ + void deletetempfile() { + Integer i; + for (i = 1; i <= 500; i++) { + new File("~tmp"+i.toString()).delete(); + new File("~"+i.toString()+"saveurls.txt").delete(); + } + } + + /* 主函数 */ + public static void main(String[] args) { + JApplet applet = new Crawler(); + System.out.println("欢迎使用java版网络爬虫Crawler\n作者:北大信科 - 唐子豪&骆宇冲"); + applet.init(); + applet.start(); + } +} diff --git a/Java_Programming/src/crawler/DownloadPage.java b/Java_Programming/src/crawler/DownloadPage.java new file mode 100644 index 0000000..d846cc7 --- /dev/null +++ b/Java_Programming/src/crawler/DownloadPage.java @@ -0,0 +1,435 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package crawler; + +import java.io.*; +import java.net.*; +import java.util.*; +import java.util.concurrent.*; +import java.util.regex.*; + +/** + * + * @author 1100012773, 1100012778 + */ + +/** + * DownloadPage类 + * 功能:给定URL等信息,连接互联网下载信息,交由Parser类提取数据,如果用户有需要则保存相应内容 + */ +public class DownloadPage extends Thread { + private Integer tempfileid; // 临时文件对应的序号 + private String url; // 网页链接 + private String tempfilename; // 临时文件名 + private String objectname; // 目标名称(如果URL对应的是网页则取网页标题,否则直接从URL中提取) + private String userdefsource; // 用户自定义的正则表达式(对应从源代码中匹配) + private String userdeftext; // 用户自定义的正则表达式(对应从正文中匹配) + private String continueurl; // 要继续爬的URL的正则表达式 + private String saveformat; // 要存储的文件格式 + private boolean useproxy = false, sendcookie = false; // 是否使用代理服务器、是否发送Cookie + private String proxyaddr; // 代理地址 + private String cookie; // Cookie内容 + private int proxyport; // 代理端口 + private boolean need[]; // 对应18个复选框的真值 + private boolean error = false; // 是否出错 + private CountDownLatch runningthreadnum; // 当前线程数 + boolean needstop; // 是否需要停止(由Work控制) + + /* 构造函数 */ + public DownloadPage(String u, Integer tfi, boolean n[], CountDownLatch rtn) { + super(); + url = u; + tempfileid = tfi; + tempfilename = "~tmp"+tfi.toString(); + need = new boolean[18]; + int i; + for (i = 0 ;i <= 17; i++) + need[i] = n[i]; + runningthreadnum = rtn; + needstop = false; + } + + /* 获取代理服务器信息 */ + public void getproxyinfo(String paddr, String pp) { + useproxy = true; + proxyaddr = paddr; + proxyport = Integer.valueOf(pp); + } + + /* 获取Cookie内容 */ + public void getcookiecontent(String cookie) { + sendcookie = true; + this.cookie = cookie; + } + + + /* 获取输入框内容 */ + public void gettext(String userdefsource, String userdeftext, String continueurl, String saveformat) { + this.userdefsource = userdefsource; + this.userdeftext = userdeftext; + this.continueurl = continueurl; + this.saveformat = saveformat; + } + + @Override + public void run() { + try { + System.out.println("准备连接 - "+url); + String charcode = null; + if (!needstop) charcode = getcharcodefromurl(); // 分析网页编码类型 + String content = null; + if (!needstop) content = getcontentfromurl(charcode); // 获取网页内容 + if (!needstop) outputtotempfile(content, charcode); // 将网页内容输出到临时文件中 + if (!error && !needstop) { + Parser parser = new Parser(url, tempfileid, content, need); + String title = parser.gettitle(); // 获取网页标题 + if (title.equals("")) System.out.println("连接成功 - "+url); + else System.out.println("连接成功 - "+title); + parser.parse(userdefsource, userdeftext, continueurl, saveformat); // 分析网页内容并提取需要的数据 + if (title.equals("")) System.out.println("提取信息成功 - "+url); + else System.out.println("提取信息成功 - "+title); + + /* 若需要保存目标…… */ + if (need[14] && !needstop) { + /* 该目标为网页且有非空标题 */ + if (!title.equals("")) { + File file = new File(tempfilename); + if (file.exists()) { + new File("下载\\目标").mkdirs(); + System.out.println("目标保存为 - "+SameFileName.newfilename("下载\\目标\\", title+".html")); + file.renameTo(new File("下载\\目标\\"+SameFileName.newfilename("下载\\目标\\", title+".html"))); // 将之前的临时文件重命名即可 + } + } + /* 该目标不是网页或是网页但没有非空标题 */ + else { + objectname = SameFileName.newfilename("下载\\目标\\", getobjname(url)); + new File("下载\\目标").mkdirs(); + System.out.println("目标保存为 - "+objectname); + downloadbybyte(url, "下载\\目标\\"+objectname); // 重新下载(因为之前的下载很可能会丢失部分特殊字符的数据) + } + } + + /* 若需要保存网页正文…… */ + if (need[15] && !needstop) { + String textname = null; + if (!title.equals("")) { + textname = title+".txt"; + } + else { + textname = getobjname(url)+".txt"; + } + new File("下载\\网页正文").mkdirs(); + textname = SameFileName.newfilename(("下载\\网页正文\\"), textname); + System.out.println("网页正文保存为 - "+textname); + outputtext(content, charcode, "下载\\网页正文\\"+textname); // 将网页正文输出到指定的文件 + } + + /* 若需要下载网页中特定格式的文件…… */ + if (need[16] || need[17]) { + Set filesuffixset = new HashSet(); + /* 需要下载图片 */ + if (need[16]) { + filesuffixset.add(".jpg"); + filesuffixset.add(".gif"); + filesuffixset.add(".png"); + filesuffixset.add(".jpeg"); + filesuffixset.add(".bmp"); + } + /* 需要下载自定义格式的文件 */ + if (need[17]) { + Pattern pattern = Pattern.compile("\\.(\\w|\\-|\\_)+", Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(saveformat); + while (matcher.find()) { + filesuffixset.add(saveformat.substring(matcher.start(), matcher.end()).toLowerCase()); + } + } + if (!filesuffixset.isEmpty() && !needstop) downloadfile(content, "下载 - "+title, filesuffixset); // 下载文件 + } + + } + } catch (MalformedURLException ex) { + System.out.println("url格式不对 - "+url); // url格式不对 + error = true; + } catch (IOException ex) { + System.out.println("网络连接异常 - "+url); // 网络连接异常 + error = true; + } + + File file = new File(tempfilename); + if (file.exists()) file.delete(); // 删除临时文件 + runningthreadnum.countDown(); // 线程数减一 + } + + /* 获取网页编码类型 */ + private String getcharcodefromurl() { + try { + /* 若需要使用代理服务器则设置代理 */ + SocketAddress add = null; + if (useproxy) add = new InetSocketAddress(proxyaddr, proxyport); + Proxy proxy = null; + if (useproxy) proxy = new Proxy(Proxy.Type.HTTP , add); + + /* 连接网络,获取网页头信息 */ + URL u = new URL(url); + HttpURLConnection urlconnection = null; + if (useproxy) urlconnection = (HttpURLConnection)u.openConnection(proxy); + else urlconnection = (HttpURLConnection)u.openConnection(); + + if (sendcookie) urlconnection.setRequestProperty("Cookie", cookie); + urlconnection.connect(); + + String charcode = null; + + /* 分析网页头信息 */ + Map> map = urlconnection.getHeaderFields(); + Set keys = map.keySet(); + Iterator iterator = keys.iterator(); + + String key = null; + String tmp = null; + while (iterator.hasNext()) { + if (needstop) return "UTF-8"; + key = iterator.next(); + tmp = map.get(key).toString().toLowerCase(); + + /* 若网页头中含有"Content-Type"项且含有"charset="字段,则提取信息并返回 */ + if (key != null && key.equals("Content-Type")) { + int m = tmp.indexOf("charset="); + if (m != -1) { + charcode = tmp.substring(m + 8).replace("]", ""); + return charcode; + } + } + } + + if (needstop) return "UTF-8"; + + /* 重新连接,逐行提取网页源代码,再从源代码中寻找字符编码信息 */ + HttpURLConnection conn = null; + if (useproxy) conn = (HttpURLConnection)(new URL(url).openConnection(proxy)); + else conn = (HttpURLConnection)(new URL(url).openConnection()); + + if (sendcookie) conn.setRequestProperty("Cookie", cookie); + conn.connect(); + + BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream())); + StringBuilder sb = new StringBuilder(); + String line; + + while ((line = reader.readLine()) != null) { + if (needstop) return "UTF-8"; + line = line.toLowerCase(); + /* 在读取的字符串中寻找"charset="字段 */ + int indexofcharset = line.indexOf("charset="); + if (indexofcharset > 0) { + line = line.substring(indexofcharset); + int indexofquotation = line.indexOf("\""); + if (indexofquotation > 0) { + return line.substring(8, indexofquotation); + } + } + } + } catch (MalformedURLException ex) {} + catch (IOException ex) {} + + return "UTF-8"; // 默认是UTF-8编码 + } + + /* 获取网页内容 */ + private String getcontentfromurl(String charcode) { + try { + /* 若需要使用代理服务器则先设置代理 */ + SocketAddress add = null; + if (useproxy) add = new InetSocketAddress(proxyaddr, proxyport); + Proxy proxy = null; + if (useproxy) proxy = new Proxy(Proxy.Type.HTTP , add); + + /* 连接网络 */ + HttpURLConnection conn = null; + if (useproxy) conn = (HttpURLConnection)(new URL(url).openConnection(proxy)); + else conn = (HttpURLConnection)(new URL(url).openConnection()); + + if (sendcookie) conn.setRequestProperty("Cookie", cookie); + conn.connect(); + + InputStream is = conn.getInputStream(); + if (needstop) return ""; + String content = readfromstream(is, charcode); // 在InputStream中获取内容 + return content; + } catch(MalformedURLException e) { + System.out.println("url格式不对 - "+url); + error = true; + } catch (IOException ex) { + System.out.println("网络连接异常 - "+url); + error = true; + } + return ""; + } + + /* 从网页给的输入流中获取内容并保存在String中 */ + private String readfromstream(InputStream stream, String charcode) throws IOException { + try { + BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charcode)); + StringBuilder sb = new StringBuilder(); + String line; + + /* 逐行读取数据 */ + while ((line = reader.readLine()) != null) { + if (needstop) return ""; + sb.append(line+"\r\n"); + } + return sb.toString(); + } catch (UnsupportedEncodingException ex) { + System.out.println("无法识别的编码方式 - "+url); + error = true; + } + return ""; + } + + /* 将网页内容输出到临时文件 */ + private void outputtotempfile(String content, String charcode) { + File file = new File(tempfilename); + FileOutputStream fos = null; + Writer out = null; + try { + fos = new FileOutputStream(file, false); + out = new OutputStreamWriter(fos, charcode); + out.write(content); + } catch (FileNotFoundException ex) { + System.out.println("临时文件名出错"); + error = true; + } catch (IOException ex) { + System.out.println("临时文件输出失败"); + error = true; + } finally { + try { + if (out != null) out.close(); + if (fos != null) fos.close(); + } catch (IOException ex) { + System.out.println("无法关闭临时文件"); + } + } + } + + /* 逐字节地下载网页目标 */ + private boolean downloadbybyte(String url, String savefile) throws MalformedURLException { + boolean succeed = true; + URL u = new URL(url); + DataInputStream dis = null; + FileOutputStream fos = null; + try { + if (needstop) return false; + /* 设置代理、连网等同上 */ + SocketAddress add = null; + if (useproxy) add = new InetSocketAddress(proxyaddr, proxyport); + Proxy proxy = null; + if (useproxy) proxy = new Proxy(Proxy.Type.HTTP , add); + HttpURLConnection conn = null; + if (useproxy) conn = (HttpURLConnection)(new URL(url).openConnection(proxy)); + else conn = (HttpURLConnection)(new URL(url).openConnection()); + + if (sendcookie) conn.setRequestProperty("Cookie", cookie); + conn.connect(); + + dis = new DataInputStream(conn.getInputStream()); + if (needstop) return false; + fos = new FileOutputStream(new File(savefile)); + byte buffer[] = new byte[65536]; + int length; + + /* 每次固定读若干字节的内容 */ + while ((length = dis.read(buffer)) > 0) { + fos.write(buffer, 0, length); + if (needstop) return false; + } + } catch (IOException ex) { + System.out.println("下载失败 - "+savefile.substring(savefile.lastIndexOf("\\")+1)); + succeed = false; + } finally { + try { + if (dis != null) dis.close(); + if (fos != null) fos.close(); + } catch (IOException ex) { + System.out.println("关闭下载的文件失败"); + } + } + return succeed; + } + + /* 根据URL获取目标名称 */ + private String getobjname(String url) { + if (needstop) return url; + String destfile = new String(url); + if (destfile.endsWith("/")) destfile = destfile.substring(0, destfile.length() - 1); + int lastslashpos = destfile.lastIndexOf("/"); + destfile = destfile.substring(lastslashpos + 1); + int lastdotpos = destfile.lastIndexOf("."); + if (lastdotpos > 0) { + int tmpindex = destfile.indexOf("?", lastdotpos); + if (tmpindex > 0) destfile = destfile.substring(0, tmpindex); + if ((tmpindex = destfile.indexOf("%", lastdotpos)) > 0) destfile = destfile.substring(0, tmpindex); + if ((tmpindex = destfile.indexOf("&", lastdotpos)) > 0) destfile = destfile.substring(0, tmpindex); + if ((tmpindex = destfile.indexOf("=", lastdotpos)) > 0) destfile = destfile.substring(0, tmpindex); + if ((tmpindex = destfile.indexOf("+", lastdotpos)) > 0) destfile = destfile.substring(0, tmpindex); + if ((tmpindex = destfile.indexOf(":", lastdotpos)) > 0) destfile = destfile.substring(0, tmpindex); + } + destfile = destfile.replace(":", "").replace("<","").replace(">","").replace("?","").replace("|","").replace("*","").replace("/","").replace("\\","").replace("\"", ""); + if (destfile.length() > 127) destfile = destfile.substring(destfile.length() - 127); + return destfile; + } + + /* 输入网页正文 */ + private void outputtext(String content, String charcode, String savefile) { + if (needstop) return; + String text = new String(content); + /* 删去所有js脚本、html标签,并将&***;变为原有字符 */ + text = text.replaceAll("<(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)[^>]*?>[\\s\\S]*?", "").replaceAll("<(s|S)(t|T)(y|Y)(l|L)(e|E)[^>]*?>[\\s\\S]*?","").replaceAll("
","\r\n").replaceAll("<[\\s\\S]*?>", "").replaceAll("(\r\n)+","\r\n").replaceAll("(\\s)+"," ").replaceAll("\\&((nbsp)|(\\#12288)|(\\#160))(\\;)?", " ").replaceAll("\\&((lt)|(\\#60))(\\;)?","<").replaceAll("\\&((gt)|(\\#62))(\\;)?",">").replaceAll("\\&((quot)|(#34))(\\;)?","\"").replaceAll("\\&((apos)|(\\#39))(\\;)?","'").replaceAll("\\©(\\;)?","©").replaceAll("\\®(\\;)?","®").replaceAll("\\&((amp)|(#38))(\\;)?","&"); + File file = new File(savefile); + FileOutputStream fos = null; + Writer out = null; + try { + fos = new FileOutputStream(file, false); + out = new OutputStreamWriter(fos, charcode); + if (needstop) return; + out.write(text); + } catch (FileNotFoundException ex) { + System.out.println("网页正文文件名出错"); + error = true; + } catch (IOException ex) { + System.out.println("网页正文输出失败"); + error = true; + } finally { + try { + if (out != null) out.close(); + if (fos != null) fos.close(); + } catch (IOException ex) { + System.out.println("无法关闭网页正文文件"); + } + } + } + + /* 下载指定格式的文件 */ + void downloadfile(String content, String dir, Set suffixset) throws MalformedURLException, IOException { + Set downloaded = new HashSet(); + Pattern pattern = Pattern.compile("(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?", Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(content); + + /* 从网页源码中提取指定格式的URL */ + while (matcher.find()) { + if (needstop) return; + String url = content.substring(matcher.start(), matcher.end()); + String destfile = getobjname(url); + destfile = SameFileName.newfilename("下载\\"+dir+"\\", destfile); + int lastdotpos = destfile.lastIndexOf("."); + if (lastdotpos > 0) { + if (suffixset.contains(destfile.substring(lastdotpos).toLowerCase()) && downloaded.add(url.toLowerCase())) { + new File("下载\\"+dir).mkdirs(); + if (downloadbybyte(url, "下载\\"+dir+"\\"+destfile)) System.out.println("下载成功 - "+destfile); /* 下载该URL */ + } + } + } + } + +} diff --git a/Java_Programming/src/crawler/Parser.java b/Java_Programming/src/crawler/Parser.java new file mode 100644 index 0000000..bc41a62 --- /dev/null +++ b/Java_Programming/src/crawler/Parser.java @@ -0,0 +1,152 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package crawler; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +/** + * + * @author 1100012773, 1100012778 + */ + +/** + * Parser类 + * 功能:分析网页源文件,提取需要的信息并输出 + */ +public class Parser { + + private String url; // 网页链接 + private String tempfilename; // 输出的临时文件名 + private String content; // 网页源代码 + private boolean need[]; // 对应DownloadPage的need + + /* 构造函数 */ + public Parser(String u, Integer tfi, String ct, boolean n[]) { + url = u; + tempfilename = "~"+tfi.toString()+"saveurls.txt"; + content = ct; + int i; + need = new boolean[18]; + for (i = 0; i < 17; i++) + need[i] = n[i]; + } + + /* 分析源代码内容,提取需要的信息并输出 */ + public void parse(String userdefsource, String userdeftext, String continueurl, String saveformat) { + String title = gettitle(); + try { + String text = content.replace("\n","").replace("\r","").replaceAll("<(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)[^>]*?>.*?", "").replaceAll("<(s|S)(t|T)(y|Y)(l|L)(e|E)[^>]*?>.*?","").replaceAll("<.*?>", "").replaceAll("\\&((nbsp)|(\\#12288)|(\\#160))(\\;)?", " ").replaceAll("\\&((lt)|(\\#60))(\\;)?","<").replaceAll("\\&((gt)|(\\#62))(\\;)?",">").replaceAll("\\&((quot)|(#34))(\\;)?","\"").replaceAll("\\&((apos)|(\\#39))(\\;)?","'").replaceAll("\\©(\\;)?","©").replaceAll("\\®(\\;)?","®").replaceAll("\\&((amp)|(#38))(\\;)?","&"); + + /* regex[0]到regex[10]为URL、电子邮箱地址、ip地址、手机号码、电话号码、QQ号码、身份证号码、日期、时间、电驴链接、迅雷链接的正则表达式 */ + String regex[] = new String[18]; + regex[0] = new String("(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?"); + regex[1] = new String("[\\w]+([\\.\\_\\-]*[\\w])*\\@([\\w]+[\\w\\-]*[\\w]+\\.)+[\\w]+"); + regex[2] = new String("(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9])|(\\*))\\.(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9])|(\\*))\\.(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9])|(\\*))\\.(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9])|(\\*))"); + regex[3] = new String("((\\+)?86(\\-)?)?(1)(((3|5|8)[0-9])|(47))[0-9]{8}"); + regex[4] = new String("(((0?)|[1-9])(0?|[1-9])([0-9][0-9])\\-)?[1-9]([0-9]{6})[0-9]?((\\-)[0-9]{1,4})?"); + regex[5] = new String("[1-9][0-9]{4,9}"); + regex[6] = new String("(([1-5][1-9])|6[1-5]|(71)|(81)|(82))([0-9]{4})((18)|(19)|(20))([0-9]{2})((0[1-9])|(11)|(12))(([0-2][0-9])|30|31)[0-9]{3}([0-9]|x|X)"); + regex[7] = new String("([0-9]{2,4}(\\-)((0?[1-9])|(10)|(11)|(12))(\\-)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|([0-9]{2,4}(\\.)((0?[1-9])|(10)|(11)|(12))(\\.)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|([0-9]{2,4}(\\/)((0?[1-9])|(10)|(11)|(12))(\\/)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\-((0?[1-9])|(10)|(11)|(12))\\-([0-9]{2,4}))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\.((0?[1-9])|(10)|(11)|(12))\\.([0-9]{2,4}))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\/((0?[1-9])|(10)|(11)|(12))\\/([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\-(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\-([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\.(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\.([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\/(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\/([0-9]{2,4}))"); + regex[8] = new String("(((1[0-9])|(2[0-3])|(0?[0-9]))\\:([0-5][0-9])(\\:[0-5][0-9])?)|(24\\:00(\\:00)?)"); + regex[9] = new String("ed2k://\\|file\\|[\\w\\-\\%\\(\\)\\[\\]\\.\\!]*[\\w]+\\|[0-9]+\\|[0-9A-F]+\\|((p|h)\\=[0-9A-Z]+(\\|)?)?(\\/)?"); + regex[10] = new String("thunder://[\\w\\+\\/\\=]+"); + regex[11] = userdefsource; // 用户自定义的正则表达式1 + regex[12] = userdeftext; // 用户自定义的正则表达式2 + regex[13] = new String("(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?"); // 同为URL正则表达式 + + /* 输出文件的前缀以及保存的文件夹的名称 */ + String filetitleprefix[] = new String[18]; + filetitleprefix[0] = new String("URL"); + filetitleprefix[1] = new String("电子邮箱地址"); + filetitleprefix[2] = new String("ip地址"); + filetitleprefix[3] = new String("手机号码"); + filetitleprefix[4] = new String("电话号码"); + filetitleprefix[5] = new String("QQ号码"); + filetitleprefix[6] = new String("身份证号码"); + filetitleprefix[7] = new String("日期"); + filetitleprefix[8] = new String("时间"); + filetitleprefix[9] = new String("电驴链接"); + filetitleprefix[10] = new String("迅雷链接"); + filetitleprefix[11] = new String("自定义从源代码提取"); + filetitleprefix[12] = new String("自定义在正文中提取"); + + int i; + for (i = 0; i <= 13; i++) { + if (need[i]) { + Pattern pattern = Pattern.compile(regex[i], Pattern.CASE_INSENSITIVE); + String matchstr; + if ((i >= 3 && i <= 6) || (i >= 9 && i <= 10) || (i == 12)) matchstr = text; // 手机号码、电话号码、QQ号码、身份证号码、电驴链接、迅雷链接、自定义1在正文中提取 + else matchstr = content; // 其余从源代码中提取 + Matcher matcher = pattern.matcher(matchstr); + Set set = new HashSet(); + while (matcher.find()) { + String newitem = matchstr.substring(matcher.start(), matcher.end()); + set.add(newitem); + } + + /* 额外分析形如" iter = set.iterator(); + while (iter.hasNext()) { + out.write(iter.next()+"\r\n"); + } + out.close(); + fos.close(); + } + } + + } catch (PatternSyntaxException ex) { + System.out.println("自定义正则表达式语法错误"); + } catch (FileNotFoundException ex) { + System.out.println("输出失败"); + } catch (UnsupportedEncodingException ex) { + System.out.println("无法识别的编码方式"); + } catch (IOException ex) { + System.out.println("输出失败"); + } + } + + /* 从源代码的...段中获取网页标题 */ + public String gettitle() { + String contentwithoutline = content; + contentwithoutline = contentwithoutline.replace("\n", "").replace("\r", "").replaceAll("[\\s]+"," "); + String regex = "[\\s\\S]*?"; + String title = ""; + Pattern pattern = Pattern.compile(regex, Pattern.CANON_EQ); + Matcher matcher = pattern.matcher(contentwithoutline); + if (matcher.find()) { + title = contentwithoutline.substring(matcher.start(), matcher.end()).replaceAll("<.*?>", "").replace(":", "").replace("<","").replace(">","").replace("?","").replace("|","").replace("*","").replace("/","").replace("\\","").replace("\"", ""); + if (title.startsWith(" ")) title = title.substring(1); + return title; + } + return ""; + } + +} diff --git a/Java_Programming/src/crawler/SameFileName.java b/Java_Programming/src/crawler/SameFileName.java new file mode 100644 index 0000000..02fef2b --- /dev/null +++ b/Java_Programming/src/crawler/SameFileName.java @@ -0,0 +1,42 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package crawler; + +import java.io.*; +/** + * + * @author 1100012773, 1100012778 + */ + +/** + * SameFileName类 + * 若某个目录下已存在某文件,更改新文件的文件名(如a.txt->a[2].txt,ab[3]->ab[4]) + */ +public class SameFileName { + + public static String newfilename(String dir, String oldfilename) { + String filename = oldfilename; + File file = new File(dir+oldfilename); + if (!file.exists()) return filename; + int lastdotpos = oldfilename.lastIndexOf("."); + Integer index = 1; + if (lastdotpos == -1) { + while (true) { + index++; + filename = oldfilename+"["+index.toString()+"]"; + if (!new File(dir+filename).exists()) return filename; + } + } + else { + String suffix = oldfilename.substring(lastdotpos); + while (true) { + index++; + filename = oldfilename.substring(0, lastdotpos)+"["+index.toString()+"]"+suffix; + if (!new File(dir+filename).exists()) return filename; + } + } + } + +} diff --git a/Java_Programming/src/crawler/TxtFileFilter.java b/Java_Programming/src/crawler/TxtFileFilter.java new file mode 100644 index 0000000..77cc8ad --- /dev/null +++ b/Java_Programming/src/crawler/TxtFileFilter.java @@ -0,0 +1,34 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package crawler; + +import java.io.File; +import javax.swing.filechooser.*; + +/** + * + * @author 1100012773, 1100012778 + */ + +/** + * TxtFileFilter类 + * 文件筛选器,只用于点击"浏览"按钮弹出的选择文件的对话框 + */ +class TxtFileFilter extends FileFilter { + + @Override + public boolean accept(File f) { + if(f.isDirectory()) { + return true; // 显示文件夹 + } + String nameString = f.getName(); + return nameString.toLowerCase().endsWith(".txt"); // 显示txt文件 + } + + @Override + public String getDescription() { + return "文本文件 (*.txt)"; // 类型提示 + } +} diff --git a/Java_Programming/src/crawler/Work.java b/Java_Programming/src/crawler/Work.java new file mode 100644 index 0000000..c0ec2bc --- /dev/null +++ b/Java_Programming/src/crawler/Work.java @@ -0,0 +1,201 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package crawler; + +import java.awt.Color; +import java.util.*; +import java.io.*; +import java.util.concurrent.*; +import java.util.regex.*; + +/** + * + * @author 1100012773, 1100012778 + */ + +/** + * Work类 + * 网络爬虫的工作类,启动DownloadPage线程,并保存要连接的URL的队列 + */ +public class Work extends Thread { + + boolean needstop; // 是否需要停止(由Crawler控制) + + /* 构造函数 */ + public Work() { + needstop = false; + } + + @Override + public void run() { + /* 开始工作时,更改三个按钮的颜色 */ + Crawler.startjb.setBackground(Color.LIGHT_GRAY); + Crawler.stopjb.setBackground(null); + Crawler.choosefilejb.setBackground(Color.LIGHT_GRAY); + + Set set = new HashSet(); // 记录连接的URL + Queue queue = new LinkedList(); // 记录即将连接的URL + + /* 直接输入网址 */ + if(Crawler.inputurljrb.isSelected()) { + queue.offer(Crawler.inputurljtf.getText()); + } + + /* 从txt文件中导入网址 */ + else { + String importfilename = Crawler.importtxtjtf.getText(); + FileInputStream fis = null; + InputStreamReader isr = null; + BufferedReader br = null; + try { + String readstr; + fis = new FileInputStream(importfilename); + isr = new InputStreamReader(fis); + br = new BufferedReader(isr); + while ((readstr = br.readLine()) != null) { + queue.offer(readstr); + } + } catch(FileNotFoundException e) { + System.out.println("找不到指定文件"); + } catch(IOException e) { + System.out.println("读取文件失败"); + } finally { + try { + if (br != null) br.close(); + if (isr != null) isr.close(); + if (fis != null) fis.close(); + } catch (IOException ex) { + System.out.println("关闭文件失败"); + } + } + } + + /* 判断是否需要代理服务器,若需要则读取代理服务器的地址和端口 */ + boolean useproxy = Crawler.useproxyjcb.isSelected(); + if (useproxy) { + if (!Pattern.matches("(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9]))\\.(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9]))\\.(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9]))\\.(((25)[0-5][0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9]))", Crawler.proxyaddrjtf.getText())) { + useproxy = false; + System.out.println("代理服务器地址格式错误"); + } + else if (!Pattern.matches(("[0-9]{1,5}"), Crawler.proxyportjtf.getText())) { + useproxy = false; + System.out.println("代理服务器端口格式错误"); + } + } + + /* 是否需要发送Cookie */ + boolean sendcookie = Crawler.sendcookiejcb.isSelected(); + + /* 直到即将连接的URL为空才结束循环 */ + while (!queue.isEmpty()) { + if (needstop) break; + String str; + int queuesize = queue.size(); + if (queuesize > 500) queuesize = 500; // 设置DownloadPage线程数最多为500 + Integer i; + boolean b[] = new boolean[18]; + for (i = 0; i <= 17; i++) + b[i] = Crawler.arrjcb[i].isSelected(); + CountDownLatch runningthreadnum = new CountDownLatch(queue.size()); + DownloadPage task[] = new DownloadPage[queuesize+1]; + for (i = 1; i <= queuesize; i++) { + if (needstop) { + int j; + for (j = 1; j < i; j++) + if (task[j] != null) task[j].needstop = true; + break; + } + str = queue.poll(); + str = str.replace(" ", ""); + if (!str.equals("")) { + if (str.indexOf("://") == -1 || str.indexOf("://") > 12) str = "http://" + str; // 若输入的网址没有http://,则补上 + set.add(str); + task[i] = new DownloadPage(str, i, b, runningthreadnum); // 新建任务 + if (useproxy) task[i].getproxyinfo(Crawler.proxyaddrjtf.getText(), Crawler.proxyportjtf.getText()); // 需要的话设置代理 + if (sendcookie) task[i].getcookiecontent(Crawler.cookiejtf.getText()); + task[i].gettext(Crawler.userdefsourcejtf.getText(), Crawler.userdeftextjtf.getText(), Crawler.continueurljtf.getText(), Crawler.saveformatjtf.getText()); // 获取输入框内容 + task[i].start(); // 任务开始 + } + else { + runningthreadnum.countDown(); // 网址为空,没有新建任务,线程数自动减一 + } + } + + try { + /* 等候全部DownloadPage线程结束 */ + while (runningthreadnum.getCount() > 0) { + sleep(200); + if (needstop) { + int j; + for (j = 1; j <= queuesize; j++) + if (task[j] != null) task[j].needstop = true; + sleep(1000); + break; + } + } + } catch (InterruptedException ex) { + System.out.println("线程中断异常"); + } + + /* 若还需要爬新的网页,则加入队列中 */ + if (b[13]) { + for (i = 1; i <= queuesize; i++) { + if (needstop) break; + String urlsavefile = "~"+i.toString()+"saveurls.txt"; + File file = new File(urlsavefile); + if (file.exists()) { + FileInputStream fis = null; + InputStreamReader isr = null; + BufferedReader br = null; + try { + fis = new FileInputStream(file); + isr = new InputStreamReader(fis); + br = new BufferedReader(isr); + String line; + String regex = Crawler.continueurljtf.getText(); + while ((line = br.readLine()) != null) { + boolean pass = false; + + /* 若在输入框中没有输入任何字符,视为继续爬任何网页,否则只爬符合那条正则表达式的内容 */ + if (regex.equals("")) { + pass = true; + } + else { + pass = Pattern.matches(regex, line); + } + if (pass) { + if (set.add(line)) { + queue.offer(line); + } + } + } + } catch (PatternSyntaxException ex) { + System.out.println("自定义正则表达式语法错误"); + } catch (FileNotFoundException ex) { + System.out.println("临时保存url的文件不存在"); + } catch (IOException ex) { + System.out.println("读取临时保存url的文件失败"); + } finally { + try { + if (br != null) br.close(); + if (isr != null) isr.close(); + if (fis != null) fis.close(); + } catch (IOException ex) { + System.out.println("关闭临时文件失败"); + } + } + file.delete(); + } // end of if(file.exists()) + } // end of for + } // end of if(b[13]) + } // end of while + + /* 工作结束时,更改三个按钮的颜色 */ + Crawler.startjb.setBackground(null); + Crawler.stopjb.setBackground(Color.LIGHT_GRAY); + Crawler.choosefilejb.setBackground(null); + } + +}