From 568e0530dbf30e11b463acc728eb82c0072b138a Mon Sep 17 00:00:00 2001
From: tongtzeho <tangzihao@pku.edu.cn>
Date: Sun, 1 Jan 2017 11:48:08 +0800
Subject: [PATCH] Add files via upload

---
 Java_Programming/src/crawler/Crawler.java     | 372 +++++++++++++++
 .../src/crawler/DownloadPage.java             | 435 ++++++++++++++++++
 Java_Programming/src/crawler/Parser.java      | 152 ++++++
 .../src/crawler/SameFileName.java             |  42 ++
 .../src/crawler/TxtFileFilter.java            |  34 ++
 Java_Programming/src/crawler/Work.java        | 201 ++++++++
 6 files changed, 1236 insertions(+)
 create mode 100644 Java_Programming/src/crawler/Crawler.java
 create mode 100644 Java_Programming/src/crawler/DownloadPage.java
 create mode 100644 Java_Programming/src/crawler/Parser.java
 create mode 100644 Java_Programming/src/crawler/SameFileName.java
 create mode 100644 Java_Programming/src/crawler/TxtFileFilter.java
 create mode 100644 Java_Programming/src/crawler/Work.java

diff --git a/Java_Programming/src/crawler/Crawler.java b/Java_Programming/src/crawler/Crawler.java
new file mode 100644
index 0000000..426ce74
--- /dev/null
+++ b/Java_Programming/src/crawler/Crawler.java
@@ -0,0 +1,372 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package crawler;
+
+import java.awt.*;
+import java.awt.event.*;
+import java.io.*;
+import javax.swing.*;
+
+/**
+ *  程序名：Crawler
+ *  作者：北大信科计算机系11级唐子豪(1100012773)、北大信科计算机系11级骆宇冲(1100012778)
+ *  编译环境：Microsoft Windows 7(64-bit)下的NetBeans IDE 7.3
+ *  源文件：Crawler.java, DownloadPage.java, Parser.java, SameFileName.java, TxtFileFilter.java, Work.java
+ *  功能：
+ *  1.多线程地连接互联网，获取页面源代码，在工作中随时可以停止或退出
+ *  2.通过正则表达式匹配，根据用户的选择可提取URL、电子邮箱、QQ号码、日期、电驴链接等信息
+ *  3.用户可自定义正则表达式，从页面源代码或在正文中提取信息
+ *  4.用户可自定义URL的正则表达式，当页面含有匹配的URL时，继续连接并提取信息
+ *  5.允许下载网页或URL指向的文件（如exe、mp3等）
+ *  6.获取网页正文（去掉源代码中的html标签和js脚本等）
+ *  7.下载并保存网页中的图片或网页中含有的自定义格式的文件
+ *  8.通过设置代理服务器连接互联网
+ *  9.请求网页时发送给定的Cookie
+ */
+
+/**
+ *  主类：Crawler
+ *  功能：画界面，捕捉按键
+ */
+public class Crawler extends JApplet {
+
+    static JFrame frame; // main window
+    static JTextField inputurljtf, importtxtjtf, proxyaddrjtf, proxyportjtf; // text fields: URL to crawl, txt file to import, proxy address, proxy port
+    static JTextField userdefsourcejtf, userdeftextjtf, continueurljtf, saveformatjtf, cookiejtf; // text fields: the two user-defined regexes, regex of URLs to keep crawling, file formats to save, Cookie
+    static ButtonGroup bgrp; // group holding the two radio buttons
+    static JRadioButton inputurljrb, importtxtjrb; // radio buttons: type a URL / import URLs from a txt file
+    static JButton choosefilejb, startjb, stopjb; // browse, start and stop buttons
+    static JCheckBox arrjcb[], useproxyjcb, sendcookiejcb; // the 18 feature check boxes, plus "use proxy" and "send Cookie"
+    static JLabel proxyaddrjlb, proxyportjlb; // "address" and "port" captions
+    static Work work = null; // the job that was started (null until the first start)
+    
+    @Override
+    public void init() { // builds the window and every widget at fixed pixel positions; button handlers are added in start()
+        frame = new JFrame("Crawler");
+        frame.setSize(406, 446); // window size
+	frame.setLocation(350, 130); // initial window position
+        frame.setResizable(false); // the window cannot be resized
+        
+        /* Exit the program when the window's close button is clicked */
+        frame.addWindowListener(new WindowAdapter() {
+            @Override
+            public void windowClosing(WindowEvent arg0) {
+                deletetempfile();
+                System.out.println("退出");
+                System.exit(0);
+            }
+        });
+        
+        /* Text field for the URL to crawl */
+        inputurljtf=new JTextField("", 8192);
+        inputurljtf.setSize(286, 20);
+        inputurljtf.setLocation(110, 10);
+        inputurljtf.setBackground(Color.WHITE);
+        frame.add(inputurljtf);
+        
+        /* Text field for the txt file to import */
+        importtxtjtf=new JTextField("", 8192);
+        importtxtjtf.setSize(205, 20);
+        importtxtjtf.setLocation(110, 40);
+        importtxtjtf.setBackground(Color.WHITE);
+        frame.add(importtxtjtf);
+        
+        /* The two radio buttons and their group; "type a URL" is selected initially */
+        bgrp = new ButtonGroup();
+        inputurljrb = new JRadioButton("输入网页地址 ", true);
+        importtxtjrb = new JRadioButton("从txt导入网址", false);
+        bgrp.add(inputurljrb);
+        bgrp.add(importtxtjrb);
+        inputurljrb.setSize(110, 20);
+        importtxtjrb.setSize(110, 20);
+        inputurljrb.setLocation(0, 10);
+        importtxtjrb.setLocation(0, 40);
+        frame.add(inputurljrb);
+        frame.add(importtxtjrb);
+        
+        /* Browse button */
+        choosefilejb = new JButton("浏览...");
+        choosefilejb.setSize(75, 20);
+        choosefilejb.setLocation(320, 40);
+        frame.add(choosefilejb);
+        
+        arrjcb = new JCheckBox[18];
+        
+        /* Check box: extract URLs */
+        arrjcb[0] = new JCheckBox("提取URL");
+        arrjcb[0].setSize(92, 20);
+        arrjcb[0].setLocation(0, 70);
+        
+        /* Check box: extract e-mail addresses */
+        arrjcb[1] = new JCheckBox("提取电子邮箱地址");
+        arrjcb[1].setSize(132, 20);
+        arrjcb[1].setLocation(135,70);
+        
+        /* Check box: extract IP addresses */
+        arrjcb[2] = new JCheckBox("提取ip地址");
+        arrjcb[2].setSize(98, 20);
+        arrjcb[2].setLocation(270, 70);
+        
+        /* Check box: extract mobile phone numbers */
+        arrjcb[3] = new JCheckBox("提取手机号码");
+        arrjcb[3].setSize(113, 20);
+        arrjcb[3].setLocation(0, 95);
+        
+        /* Check box: extract telephone numbers */
+        arrjcb[4] = new JCheckBox("提取电话号码");
+        arrjcb[4].setSize(113, 20);
+        arrjcb[4].setLocation(135, 95);
+        
+        /* Check box: extract QQ numbers */
+        arrjcb[5] = new JCheckBox("提取QQ号码");
+        arrjcb[5].setSize(99, 20);
+        arrjcb[5].setLocation(270, 95);
+        
+        /* Check box: extract ID card numbers */
+        arrjcb[6] = new JCheckBox("提取身份证号码");
+        arrjcb[6].setSize(119, 20);
+        arrjcb[6].setLocation(0, 120);
+        
+        /* Check box: extract dates */
+        arrjcb[7] = new JCheckBox("提取日期");
+        arrjcb[7].setSize(93, 20);
+        arrjcb[7].setLocation(135, 120);
+        
+        /* Check box: extract times */
+        arrjcb[8] = new JCheckBox("提取时间");
+        arrjcb[8].setSize(93, 20);
+        arrjcb[8].setLocation(270, 120);
+        
+        /* Check box: extract eDonkey links */
+        arrjcb[9] = new JCheckBox("提取电驴链接(ed2k://...)");
+        arrjcb[9].setSize(176, 20);
+        arrjcb[9].setLocation(0, 145);
+        
+        /* Check box: extract Thunder links */
+        arrjcb[10] = new JCheckBox("提取迅雷链接(thunder://...)");
+        arrjcb[10].setSize(184, 20);
+        arrjcb[10].setLocation(186, 145);
+        
+        /* Check box and text field: extract a user-defined regex from the page source */
+        arrjcb[11] = new JCheckBox("从源代码提取以下内容");
+        arrjcb[11].setSize(160, 20);
+        arrjcb[11].setLocation(0, 175);
+        userdefsourcejtf = new JTextField("", 8192);
+        userdefsourcejtf.setSize(234, 20);
+        userdefsourcejtf.setLocation(162, 175);
+        frame.add(userdefsourcejtf);
+        
+        /* Check box and text field: extract a user-defined regex from the body text */
+        arrjcb[12] = new JCheckBox("在正文中提取以下内容");
+        arrjcb[12].setSize(160, 20);
+        arrjcb[12].setLocation(0, 205);
+        userdeftextjtf = new JTextField("", 8192);
+        userdeftextjtf.setSize(234, 20);
+        userdeftextjtf.setLocation(162, 205);
+        frame.add(userdeftextjtf);
+        
+        /* Check box and text field: keep crawling pages that match the given regex */
+        arrjcb[13] = new JCheckBox("继续爬以下网页");
+        arrjcb[13].setSize(119, 20);
+        arrjcb[13].setLocation(0, 235);
+        continueurljtf = new JTextField("", 8192);
+        continueurljtf.setSize(275, 20);
+        continueurljtf.setLocation(121, 235);
+        frame.add(continueurljtf);
+        
+        /* Check box: save the target */
+        arrjcb[14] = new JCheckBox("保存目标");
+        arrjcb[14].setSize(93, 20);
+        arrjcb[14].setLocation(0, 265);
+        
+        /* Check box: save the page's body text */
+        arrjcb[15] = new JCheckBox("保存网页正文");
+        arrjcb[15].setSize(113, 20);
+        arrjcb[15].setLocation(135, 265);
+        
+        /* Check box: download the page's images */
+        arrjcb[16] = new JCheckBox("下载网页图片");
+        arrjcb[16].setSize(113, 20);
+        arrjcb[16].setLocation(270, 265);
+        
+        /* Check box and text field: download files of the given formats */
+        arrjcb[17] = new JCheckBox("下载以下格式的文件");
+        arrjcb[17].setSize(142, 20);
+        arrjcb[17].setLocation(0, 295);
+        saveformatjtf = new JTextField("", 8192);
+        saveformatjtf.setSize(252, 20);
+        saveformatjtf.setLocation(144, 295);
+        frame.add(saveformatjtf);
+        
+        /* Give the 18 check boxes above their font and add them to the window */
+        int i;
+        for(i = 0; i <= 17; i++) {
+            arrjcb[i].setFont(new Font(Font.DIALOG, Font.PLAIN, 13));
+            frame.add(arrjcb[i]);
+        }
+        
+        /* Check box: use a proxy server */
+        useproxyjcb = new JCheckBox("使用代理服务器");
+        useproxyjcb.setSize(119, 20);
+        useproxyjcb.setLocation(0, 325);
+        useproxyjcb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13));
+        frame.add(useproxyjcb);
+        
+        /* Caption for the proxy address */
+        proxyaddrjlb = new JLabel("地址:");
+        proxyaddrjlb.setSize(33, 20);
+        proxyaddrjlb.setLocation(139, 325);
+        proxyaddrjlb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13));
+        frame.add(proxyaddrjlb);
+        
+        /* Text field for the proxy address */
+        proxyaddrjtf = new JTextField("", 256);
+        proxyaddrjtf.setSize(108, 20);
+        proxyaddrjtf.setLocation(173, 325);
+        frame.add(proxyaddrjtf);
+        
+        /* Caption for the proxy port */
+        proxyportjlb = new JLabel("端口:");
+        proxyportjlb.setSize(33, 20);
+        proxyportjlb.setLocation(306, 325);
+        proxyportjlb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13));
+        frame.add(proxyportjlb);
+        
+        /* Text field for the proxy port */
+        proxyportjtf = new JTextField("", 16);
+        proxyportjtf.setSize(56, 20);
+        proxyportjtf.setLocation(340, 325);
+        frame.add(proxyportjtf);
+        
+        /* Check box: send a Cookie */
+        sendcookiejcb = new JCheckBox("发送Cookie");
+        sendcookiejcb.setSize(97, 20);
+        sendcookiejcb.setLocation(0, 355);
+        sendcookiejcb.setFont(new Font(Font.DIALOG, Font.PLAIN, 13));
+        frame.add(sendcookiejcb);
+        
+        /* Text field for the Cookie */
+        cookiejtf = new JTextField("", 8192);
+        cookiejtf.setSize(295, 20);
+        cookiejtf.setLocation(101, 355);
+        frame.add(cookiejtf);
+        
+        /* Start button */
+        startjb = new JButton("开 始 (Enter)");
+        startjb.setSize(190, 23);
+        startjb.setLocation(4, 386);
+        frame.add(startjb);
+        
+        /* Stop button */
+        stopjb = new JButton("停 止");
+        stopjb.setSize(190, 23);
+        stopjb.setLocation(205, 386);
+        stopjb.setBackground(Color.LIGHT_GRAY);
+        frame.add(stopjb);
+        
+        frame.setLayout(null); // drop the default layout manager so the explicit sizes and locations above are used
+    }
+    
+    @Override
+    public void start() { // attaches the button and Enter-key handlers, then shows the window
+        /* "Browse" button: choose the txt file of URLs */
+        ActionListener choosefileal = new ActionListener() {
+            @Override
+            public void actionPerformed(ActionEvent ae) {
+                /* Do nothing while a job is running */
+                if (work == null || !work.isAlive()) {
+                    importtxtjrb.setSelected(true);
+                    /* Show the file chooser, restricted to what TxtFileFilter accepts */
+                    JFileChooser jfc = new JFileChooser (".");
+                    jfc.setAcceptAllFileFilterUsed(false);
+                    jfc.addChoosableFileFilter(new TxtFileFilter());
+                    int result = jfc.showOpenDialog(null);
+                    if(result == JFileChooser.APPROVE_OPTION) {
+                        String path = jfc.getSelectedFile().getAbsolutePath();
+                        importtxtjtf.setText(path);
+                    }
+                }
+            }
+        };
+        choosefilejb.addActionListener(choosefileal);
+        
+        /* "Start" button */
+        ActionListener startal = new ActionListener() {
+            @Override
+            public void actionPerformed(ActionEvent ae) {
+                /* Do nothing while a job is running */
+                if (work == null || !work.isAlive()) {
+                    System.out.println("开始");
+                    work = new Work(); // launch a new job
+                    work.start();
+                }
+            }
+        };
+        startjb.addActionListener(startal);
+        
+        /* "Stop" button */
+        ActionListener stopal = new ActionListener() {
+            @Override
+            public void actionPerformed(ActionEvent ae) {
+                /* Do nothing when no job is running */
+                if (work != null && work.isAlive()) {
+                    work.needstop = true; // ask the job to stop; how Work passes this on to its threads is not visible here
+                    System.out.println("准备停止...");
+                    try {
+                        work.join(); // wait for the job to end; blocks without burning CPU, unlike an empty polling loop
+                    } catch (InterruptedException ex) {
+                        Thread.currentThread().interrupt(); // keep the interrupt visible to the caller
+                        return;
+                    }
+                    deletetempfile(); // delete temp files only once the job thread has ended
+                    System.out.println("已停止");
+                }
+            }
+        };
+        stopjb.addActionListener(stopal);
+        
+        /* Pressing Enter in any text field counts as clicking "Start" */
+        addenterlistener(inputurljtf);
+        addenterlistener(importtxtjtf);
+        addenterlistener(userdefsourcejtf);
+        addenterlistener(userdeftextjtf);
+        addenterlistener(continueurljtf);
+        addenterlistener(saveformatjtf);    
+        addenterlistener(proxyaddrjtf);
+        addenterlistener(proxyportjtf);
+        addenterlistener(cookiejtf);
+        
+        frame.setVisible(true); // show the window
+    }
+    
+    void addenterlistener(JTextField jtf) { // makes Enter in the given text field act as a click on "Start"
+        jtf.addKeyListener(new KeyAdapter() {
+            @Override
+            public void keyPressed(KeyEvent event) {
+                /* Compare key codes: getKeyText() returns a label that can be localized, so matching the text "Enter" is fragile */
+                if (event.getKeyCode() == KeyEvent.VK_ENTER) {
+                    startjb.doClick(); // simulate a click on the "Start" button
+                }
+            }
+        });
+    }
+    
+    /* Deletes the files "~tmpN" and "~Nsaveurls.txt" for N = 1..500 from the working directory */
+    void deletetempfile() {
+        for (int id = 1; id <= 500; id++) {
+            String number = Integer.toString(id);
+            new File("~tmp"+number).delete();
+            new File("~"+number+"saveurls.txt").delete();
+        }
+    }
+    
+    /* Entry point. Swing components must be created on the event dispatch thread, so the UI is built through invokeLater */
+    public static void main(String[] args) {
+        System.out.println("欢迎使用java版网络爬虫Crawler\n作者：北大信科 - 唐子豪&骆宇冲");
+        SwingUtilities.invokeLater(new Runnable() {
+            @Override public void run() { JApplet applet = new Crawler(); applet.init(); applet.start(); }
+        });
+    }
+}
diff --git a/Java_Programming/src/crawler/DownloadPage.java b/Java_Programming/src/crawler/DownloadPage.java
new file mode 100644
index 0000000..d846cc7
--- /dev/null
+++ b/Java_Programming/src/crawler/DownloadPage.java
@@ -0,0 +1,435 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package crawler;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.regex.*;
+
+/**
+ *
+ * @author 1100012773, 1100012778
+ */
+
+/**
+ *  DownloadPage类
+ *  功能：给定URL等信息，连接互联网下载信息，交由Parser类提取数据，如果用户有需要则保存相应内容
+ */
+public class DownloadPage extends Thread {
+    private Integer tempfileid; // sequence number of the temp file
+    private String url; // page address
+    private String tempfilename; // name of the temp file
+    private String objectname; // target name (the page title if the URL is a web page, otherwise taken from the URL)
+    private String userdefsource; // user-defined regex matched against the page source
+    private String userdeftext; // user-defined regex matched against the body text
+    private String continueurl; // regex of the URLs that should be crawled further
+    private String saveformat; // file formats to save
+    private boolean useproxy = false, sendcookie = false; // whether to use a proxy server / whether to send a Cookie
+    private String proxyaddr; // proxy address
+    private String cookie; // Cookie content
+    private int proxyport; // proxy port
+    private boolean need[]; // truth values of the 18 check boxes
+    private boolean error = false; // whether an error occurred
+    private CountDownLatch runningthreadnum; // latch that run() counts down when this thread finishes
+    volatile boolean needstop; // stop request (controlled by Work); volatile so the polling loops in this thread see a write made by another thread
+    
+    /*
+     * Creates a download thread for address u. tfi numbers its temp file ("~tmp" + tfi),
+     * n holds the 18 check-box states (copied), rtn is the latch run() counts down at the end.
+     */
+    public DownloadPage(String u, Integer tfi, boolean n[], CountDownLatch rtn) {
+        super();
+        this.url = u;
+        this.tempfileid = tfi;
+        this.tempfilename = "~tmp"+tfi.toString();
+        this.need = new boolean[18];
+        for (int k = 0; k < this.need.length; k++) this.need[k] = n[k];
+        this.runningthreadnum = rtn;
+        this.needstop = false;
+    }
+    
+    /* Enables the proxy. pp must be a decimal port number (surrounding blanks are allowed), otherwise NumberFormatException is thrown */
+    public void getproxyinfo(String paddr, String pp) {
+        proxyport = Integer.parseInt(pp.trim()); // parse first, so a bad port leaves the proxy switched off
+        proxyaddr = paddr;
+        useproxy = true;
+    }
+    
+    /* Stores the Cookie header value to send and switches Cookie sending on */
+    public void getcookiecontent(String cookie) {
+        this.cookie = cookie;
+        this.sendcookie = true;
+    }
+    
+    
+    /* Stores the four text-field values: source regex, body-text regex, regex of URLs to keep crawling, file formats to save */
+    public void gettext(String userdefsource, String userdeftext, String continueurl, String saveformat) {
+        this.saveformat = saveformat;
+        this.continueurl = continueurl;
+        this.userdeftext = userdeftext;
+        this.userdefsource = userdefsource;
+    }
+    
+    @Override
+    public void run() { // downloads the page, hands it to Parser, saves what the check boxes ask for, and always counts the latch down
+        try {
+            System.out.println("准备连接 - "+url);
+            String charcode = null;
+            if (!needstop) charcode = getcharcodefromurl(); // detect the page encoding
+            String content = null;
+            if (!needstop) content = getcontentfromurl(charcode); // fetch the page content
+            if (!needstop) outputtotempfile(content, charcode); // write the page content to the temp file
+            if (!error && !needstop) {
+                Parser parser = new Parser(url, tempfileid, content, need);
+                String title = parser.gettitle(); // page title
+                if (title.equals("")) System.out.println("连接成功 - "+url);
+                else System.out.println("连接成功 - "+title);
+                parser.parse(userdefsource, userdeftext, continueurl, saveformat); // analyse the page and extract the requested data
+                if (title.equals("")) System.out.println("提取信息成功 - "+url);
+                else System.out.println("提取信息成功 - "+title);
+                
+                /* "Save the target" is ticked */
+                if (need[14] && !needstop) {
+                    /* The target is a web page with a non-empty title */
+                    if (!title.equals("")) {
+                        File file = new File(tempfilename);
+                        if (file.exists()) {
+                            new File("下载\\目标").mkdirs();
+                            System.out.println("目标保存为 - "+SameFileName.newfilename("下载\\目标\\", title+".html"));
+                            file.renameTo(new File("下载\\目标\\"+SameFileName.newfilename("下载\\目标\\", title+".html"))); // renaming the temp file is enough
+                        }
+                    }
+                    /* The target is not a web page, or is a page without a title */
+                    else {
+                        objectname = SameFileName.newfilename("下载\\目标\\", getobjname(url));
+                        new File("下载\\目标").mkdirs();
+                        System.out.println("目标保存为 - "+objectname);
+                        downloadbybyte(url, "下载\\目标\\"+objectname); // download again byte by byte (the text download above may have lost non-text data)
+                    }
+                }
+                
+                /* "Save the body text" is ticked */
+                if (need[15] && !needstop) {
+                    String textname = null;
+                    if (!title.equals("")) {
+                        textname = title+".txt";
+                    }
+                    else {
+                        textname = getobjname(url)+".txt";
+                    }
+                    new File("下载\\网页正文").mkdirs();
+                    textname = SameFileName.newfilename(("下载\\网页正文\\"), textname);
+                    System.out.println("网页正文保存为 - "+textname);
+                    outputtext(content, charcode, "下载\\网页正文\\"+textname); // write the body text to that file
+                }
+                
+                /* Files of particular formats have to be downloaded from the page */
+                if (need[16] || need[17]) {
+                    Set<String> filesuffixset = new HashSet<String>();
+                    /* Images */
+                    if (need[16]) {
+                        filesuffixset.add(".jpg");
+                        filesuffixset.add(".gif");
+                        filesuffixset.add(".png");
+                        filesuffixset.add(".jpeg");
+                        filesuffixset.add(".bmp");
+                    }
+                    /* User-defined formats: every ".xxx" token found in the format text field */
+                    if (need[17]) {
+                        Pattern pattern = Pattern.compile("\\.(\\w|\\-|\\_)+", Pattern.CASE_INSENSITIVE);
+                        Matcher matcher = pattern.matcher(saveformat);
+                        while (matcher.find()) {
+                            filesuffixset.add(saveformat.substring(matcher.start(), matcher.end()).toLowerCase());
+                        }
+                    }
+                    if (!filesuffixset.isEmpty() && !needstop) downloadfile(content, "下载 - "+title, filesuffixset); // download the files
+                }
+                
+            }
+        } catch (MalformedURLException ex) {
+            System.out.println("url格式不对 - "+url); // malformed URL
+            error = true;
+        } catch (IOException ex) {
+            System.out.println("网络连接异常 - "+url); // network failure
+            error = true;
+        } finally {
+            /* Runs even when an unchecked exception escapes, so the latch is always counted down */
+            File tempfile = new File(tempfilename);
+            if (tempfile.exists()) tempfile.delete(); // delete the temp file
+            runningthreadnum.countDown(); // one running thread fewer
+        }
+    }
+    
+    /*
+     * Detects the character encoding of the page.
+     * Looks at the charset parameter of the Content-Type response header first, then at the
+     * first charset declaration in the markup (<meta charset="..."> as well as the older
+     * <meta http-equiv="Content-Type" content="...; charset=...">).
+     * Returns the name in lower case, or "UTF-8" when nothing is found, a stop was
+     * requested, or the page cannot be fetched.
+     * Failures are not reported here; run() fetches the page again right after this call.
+     */
+    private String getcharcodefromurl() {
+        HttpURLConnection conn = null;
+        BufferedReader reader = null;
+        try {
+            /* Open the connection, through the HTTP proxy when one is configured */
+            URL u = new URL(url);
+            if (useproxy) {
+                SocketAddress add = new InetSocketAddress(proxyaddr, proxyport);
+                conn = (HttpURLConnection)u.openConnection(new Proxy(Proxy.Type.HTTP, add));
+            } else {
+                conn = (HttpURLConnection)u.openConnection();
+            }
+            if (sendcookie) conn.setRequestProperty("Cookie", cookie);
+            conn.connect();
+            
+            /*
+             * Response header. getContentType() is used instead of comparing map keys with
+             * "Content-Type", so the capitalisation of the header name no longer matters.
+             */
+            String charcode = findcharset(conn.getContentType());
+            if (charcode != null) return charcode;
+            if (needstop) return "UTF-8";
+            
+            /*
+             * Markup, read from the same connection. ISO-8859-1 maps every byte to one char,
+             * so the ASCII text "charset=" is found whatever the real encoding is.
+             */
+            reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "ISO-8859-1"));
+            String line;
+            while ((line = reader.readLine()) != null) {
+                if (needstop) return "UTF-8";
+                charcode = findcharset(line);
+                if (charcode != null) return charcode;
+            }
+        } catch (MalformedURLException ex) {
+            /* Deliberately not reported here; the method simply falls back to UTF-8 */
+        } catch (IOException ex) {
+            /* Deliberately not reported here; the method simply falls back to UTF-8 */
+        } finally {
+            /* Release the reader and the connection on every path, early returns included */
+            try {
+                if (reader != null) reader.close();
+            } catch (IOException ex) {
+                /* A failed close leaves nothing to recover */
+            }
+            if (conn != null) conn.disconnect();
+        }
+        return "UTF-8"; // default encoding
+    }
+    
+    /* "charset=" followed by an optionally quoted name; group 1 is the bare name */
+    private static final Pattern CHARSETPATTERN =
+            Pattern.compile("charset\\s*=\\s*[\"']?([a-z0-9][\\w.:+-]*)", Pattern.CASE_INSENSITIVE);
+    
+    /*
+     * Returns the lower-cased charset name declared in text, or null when text is null
+     * or holds no charset declaration. A quote right after "charset=" is skipped, so
+     * <meta charset="utf-8"> yields "utf-8" rather than an empty name.
+     */
+    private static String findcharset(String text) {
+        if (text == null) return null;
+        Matcher found = CHARSETPATTERN.matcher(text);
+        if (!found.find()) return null;
+        return found.group(1).toLowerCase();
+    }
+    
+    /* Downloads the page at url and returns it decoded with charcode; returns "" on a stop request or on failure (failure also sets error) */
+    private String getcontentfromurl(String charcode) {
+        InputStream is = null;
+        try {
+            /* Set up the HTTP proxy first when one is required */
+            Proxy proxy = null;
+            if (useproxy) proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyaddr, proxyport));
+            
+            /* Connect */
+            HttpURLConnection conn = null;
+            if (useproxy) conn = (HttpURLConnection)(new URL(url).openConnection(proxy));
+            else conn = (HttpURLConnection)(new URL(url).openConnection());
+            if (sendcookie) conn.setRequestProperty("Cookie", cookie);
+            conn.connect();
+            
+            is = conn.getInputStream();
+            if (needstop) return "";
+            return readfromstream(is, charcode); // read the whole response into a String
+        } catch(MalformedURLException e) {
+            System.out.println("url格式不对 - "+url);
+            error = true;
+        } catch (IOException ex) {
+            System.out.println("网络连接异常 - "+url);
+            error = true;
+        } finally {
+            /* Always release the response stream, on the early-return and error paths too */
+            try { if (is != null) is.close(); } catch (IOException ex) { /* a failed close is not actionable */ }
+        }
+        return "";
+    }
+    
+    /* Reads the whole stream as text in the given encoding; every line read is followed by CR LF in the result */
+    private String readfromstream(InputStream stream, String charcode) throws IOException {
+        BufferedReader input;
+        try {
+            input = new BufferedReader(new InputStreamReader(stream, charcode));
+        } catch (UnsupportedEncodingException ex) {
+            System.out.println("无法识别的编码方式 - "+url);
+            error = true;
+            return "";
+        }
+        
+        /* Collect line by line, giving up with an empty result as soon as a stop is requested */
+        StringBuilder collected = new StringBuilder();
+        for (String row = input.readLine(); row != null; row = input.readLine()) {
+            if (needstop) return "";
+            collected.append(row).append("\r\n");
+        }
+        return collected.toString();
+    }
+    
+    /* Writes the page text to this thread's temp file, encoded with charcode; sets error when opening or writing fails */
+    private void outputtotempfile(String content, String charcode) {
+        FileOutputStream rawout = null;
+        Writer writer = null;
+        /* The file is opened with append = false, so an older temp file of the same name is overwritten */
+        try {
+            rawout = new FileOutputStream(new File(tempfilename), false);
+            writer = new OutputStreamWriter(rawout, charcode);
+            writer.write(content);
+        } catch (FileNotFoundException ex) {
+            error = true;
+            System.out.println("临时文件名出错");
+        } catch (IOException ex) {
+            error = true;
+            System.out.println("临时文件输出失败");
+        } finally {
+            /* Close the writer before the underlying stream */
+            try {
+                if (writer != null) writer.close();
+                if (rawout != null) rawout.close();
+            } catch (IOException ex) {
+                System.out.println("无法关闭临时文件");
+            }
+        }
+    }
+    
+    /* Streams the resource at url into savefile in 64 KiB chunks; returns true only when the copy ran to the end */
+    private boolean downloadbybyte(String url, String savefile) throws MalformedURLException {
+        URL target = new URL(url); // parsed outside the try, so a malformed URL reaches the caller
+        DataInputStream in = null;
+        FileOutputStream out = null;
+        boolean ok = true;
+        try {
+            if (needstop) return false;
+            
+            /* Same connection set-up as for page downloads: optional proxy, optional Cookie */
+            HttpURLConnection conn;
+            if (useproxy) {
+                SocketAddress proxyaddress = new InetSocketAddress(proxyaddr, proxyport);
+                conn = (HttpURLConnection)target.openConnection(new Proxy(Proxy.Type.HTTP, proxyaddress));
+            } else {
+                conn = (HttpURLConnection)target.openConnection();
+            }
+            if (sendcookie) conn.setRequestProperty("Cookie", cookie);
+            conn.connect();
+            
+            in = new DataInputStream(conn.getInputStream());
+            if (needstop) return false;
+            out = new FileOutputStream(new File(savefile));
+            
+            /* Copy until read() reports no more data, checking for a stop request after every chunk */
+            byte chunk[] = new byte[65536];
+            for (int got = in.read(chunk); got > 0; got = in.read(chunk)) {
+                out.write(chunk, 0, got);
+                if (needstop) return false;
+            }
+        } catch (IOException ex) {
+            ok = false;
+            System.out.println("下载失败 - "+savefile.substring(savefile.lastIndexOf("\\")+1));
+        } finally {
+            /* Close the network stream first, then the file */
+            try {
+                if (in != null) in.close();
+                if (out != null) out.close();
+            } catch (IOException ex) {
+                System.out.println("关闭下载的文件失败");
+            }
+        }
+        return ok;
+    }
+    
+    /* Derives a file name from the last path segment of url; returns url unchanged when a stop was requested */
+    private String getobjname(String url) {
+        if (needstop) return url;
+        String name = url;
+        if (name.endsWith("/")) name = name.substring(0, name.length() - 1);
+        name = name.substring(name.lastIndexOf("/") + 1);
+        /* From the last dot onwards, cut the name at each of these characters, tried in this order */
+        int dot = name.lastIndexOf(".");
+        if (dot > 0) {
+            for (String stop : new String[] {"?", "%", "&", "=", "+", ":"}) {
+                int cut = name.indexOf(stop, dot);
+                if (cut > 0) name = name.substring(0, cut);
+            }
+        }
+        /* Remove the characters : < > ? | * / \ and ", then keep at most the last 127 characters */
+        for (String banned : new String[] {":", "<", ">", "?", "|", "*", "/", "\\", "\""}) {
+            name = name.replace(banned, "");
+        }
+        if (name.length() > 127) name = name.substring(name.length() - 127);
+        return name;
+    }
+    
+    /* Writes the body text of the page (content with scripts, styles and tags removed) to savefile, encoded with charcode */
+    private void outputtext(String content, String charcode, String savefile) {
+        if (needstop) return;
+        String text = new String(content);
+        /* Remove script and style elements and all tags, then turn &...; entities back into characters. The order matters. NOTE(review): the "(\\s)+" step collapses every whitespace run, including the CR LF produced for <br>, into one space - confirm that is intended */
+        text = text.replaceAll("<(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)[^>]*?>[\\s\\S]*?</(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)>", "").replaceAll("<(s|S)(t|T)(y|Y)(l|L)(e|E)[^>]*?>[\\s\\S]*?</(s|S)(t|T)(y|Y)(l|L)(e|E)>","").replaceAll("<br>","\r\n").replaceAll("<[\\s\\S]*?>", "").replaceAll("(\r\n)+","\r\n").replaceAll("(\\s)+"," ").replaceAll("\\&((nbsp)|(\\#12288)|(\\#160))(\\;)?", " ").replaceAll("\\&((lt)|(\\#60))(\\;)?","<").replaceAll("\\&((gt)|(\\#62))(\\;)?",">").replaceAll("\\&((quot)|(#34))(\\;)?","\"").replaceAll("\\&((apos)|(\\#39))(\\;)?","'").replaceAll("\\&copy(\\;)?","©").replaceAll("\\&reg(\\;)?","®").replaceAll("\\&((amp)|(#38))(\\;)?","&");
+        File file = new File(savefile);
+        FileOutputStream fos = null;
+        Writer out = null;
+        try {
+            fos = new FileOutputStream(file, false);
+            out = new OutputStreamWriter(fos, charcode);
+            if (needstop) return;
+            out.write(text);
+        } catch (FileNotFoundException ex) {
+            System.out.println("网页正文文件名出错");
+            error = true;
+        } catch (IOException ex) {
+            System.out.println("网页正文输出失败");
+            error = true;
+        } finally {
+            try {
+                if (out != null) out.close();
+                if (fos != null) fos.close();
+            } catch (IOException ex) {
+                System.out.println("无法关闭网页正文文件");
+            }
+        }        
+    }
+    
+    /* Downloads every URL found in content whose file extension is in suffixset into the dir folder of the download directory; a malformed link is reported and skipped */
+    void downloadfile(String content, String dir, Set<String> suffixset) throws MalformedURLException, IOException {
+        Set<String> downloaded = new HashSet<String>();
+        Pattern pattern = Pattern.compile("(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?", Pattern.CASE_INSENSITIVE);
+        Matcher matcher = pattern.matcher(content);
+        while (matcher.find()) {
+            if (needstop) return;
+            String link = matcher.group();
+            String destfile = SameFileName.newfilename("下载\\"+dir+"\\", getobjname(link));
+            int lastdotpos = destfile.lastIndexOf(".");
+            /* Skip links without a wanted extension and links already handled (URLs are compared in lower case) */
+            if (lastdotpos <= 0 || !suffixset.contains(destfile.substring(lastdotpos).toLowerCase()) || !downloaded.add(link.toLowerCase())) continue;
+            new File("下载\\"+dir).mkdirs();
+            try {
+                if (downloadbybyte(link, "下载\\"+dir+"\\"+destfile)) System.out.println("下载成功 - "+destfile); /* download this URL */
+            } catch (MalformedURLException ex) {
+                /* For example a scheme java.net.URL cannot handle; one bad link must not abort the remaining downloads */
+                System.out.println("url格式不对 - "+link);
+            }
+        }
+    }
+    
+}
diff --git a/Java_Programming/src/crawler/Parser.java b/Java_Programming/src/crawler/Parser.java
new file mode 100644
index 0000000..bc41a62
--- /dev/null
+++ b/Java_Programming/src/crawler/Parser.java
@@ -0,0 +1,152 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package crawler;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+
+/**
+ *
+ * @author 1100012773, 1100012778
+ */
+
+/**
+ *  Parser class.
+ *  Analyses the source code of one web page, extracts the requested kinds of
+ *  information with regular expressions and writes every result set to a file.
+ */
+public class Parser {
+
+    /* Regular expression of an absolute URL (used by items 0 and 13). */
+    private static final String URLREGEX = "(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?";
+    /* One part of a dotted ip address: 0 to 255 or a star. It used to be written "(25)[0-5][0-5]", which never matched 250 to 255. */
+    private static final String IPPART = "((25[0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9])|(\\*))";
+    /* Output file prefix, which is also the output directory name, of items 0 to 12. */
+    private static final String FILETITLEPREFIX[] = {
+        "URL", "电子邮箱地址", "ip地址", "手机号码", "电话号码", "QQ号码", "身份证号码",
+        "日期", "时间", "电驴链接", "迅雷链接", "自定义从源代码提取", "自定义在正文中提取"
+    };
+
+    private String url; // link of the page
+    private String tempfilename; // name of the temporary output file
+    private String content; // source code of the page
+    private boolean need[]; // corresponds to need of DownloadPage
+
+    /**
+     * Creates a parser for one page.
+     * @param u   link of the page; relative links are completed with its scheme and host
+     * @param tfi number used in the name of the temporary file ("~" + tfi + "saveurls.txt")
+     * @param ct  source code of the page
+     * @param n   need flags; up to 18 of them are copied (the former loop left out n[17])
+     */
+    public Parser(String u, Integer tfi, String ct, boolean n[]) {
+        url = u;
+        tempfilename = "~"+tfi.toString()+"saveurls.txt";
+        content = ct;
+        need = new boolean[18];
+        System.arraycopy(n, 0, need, 0, Math.min(n.length, need.length));
+    }
+
+    /**
+     * Extracts every requested item and writes it out: items 0 to 12 go to a file named
+     * "prefix - title.txt" in the directory named after the prefix, item 13 (the URLs of the page)
+     * goes to the temporary file. Each item is handled on its own, so a wrong user-defined
+     * expression or a failed output no longer keeps the following items from being processed.
+     * @param userdefsource user-defined regular expression of item 11
+     * @param userdeftext   user-defined regular expression of item 12
+     * @param continueurl   not used by this method
+     * @param saveformat    not used by this method
+     */
+    public void parse(String userdefsource, String userdeftext, String continueurl, String saveformat) {
+        String title = gettitle();
+        String text = plaintext();
+        String regex[] = buildregex(userdefsource, userdeftext);
+        for (int i = 0; i <= 13; i++) {
+            if (!need[i]) continue;
+            try {
+                /* Mobile, telephone, QQ and ID card numbers, ed2k and thunder links and item 12
+                   are extracted from the text, everything else from the source code. */
+                boolean fromtext = (i >= 3 && i <= 6) || (i >= 9 && i <= 10) || (i == 12);
+                Set<String> set = new HashSet<String>();
+                Matcher matcher = Pattern.compile(regex[i], Pattern.CASE_INSENSITIVE).matcher(fromtext ? text : content);
+                while (matcher.find()) set.add(matcher.group());
+                if (i == 0 || i == 13) addrelativelinks(set);
+                File file = new File(tempfilename); // item 13 writes the temporary file
+                if (i != 13) {
+                    String prefix = FILETITLEPREFIX[i];
+                    new File(prefix).mkdir();
+                    file = new File(prefix+"\\"+SameFileName.newfilename(prefix+"\\", prefix+" - "+title+".txt"));
+                }
+                writelines(file, set);
+            } catch (PatternSyntaxException ex) {
+                System.out.println("自定义正则表达式语法错误");
+            } catch (UnsupportedEncodingException ex) {
+                System.out.println("无法识别的编码方式");
+            } catch (IOException ex) { // includes FileNotFoundException, which printed the same message
+                System.out.println("输出失败");
+            }
+        }
+    }
+
+    /* Returns the regular expressions of items 0 to 13: URL, e-mail address, ip address, mobile number, telephone number,
+       QQ number, ID card number, date, time, ed2k link, thunder link, the two user-defined expressions and URL again. */
+    private static String[] buildregex(String userdefsource, String userdeftext) {
+        String regex[] = new String[14];
+        regex[0] = URLREGEX;
+        regex[1] = "[\\w]+([\\.\\_\\-]*[\\w])*\\@([\\w]+[\\w\\-]*[\\w]+\\.)+[\\w]+";
+        regex[2] = IPPART+"\\."+IPPART+"\\."+IPPART+"\\."+IPPART;
+        regex[3] = "((\\+)?86(\\-)?)?(1)(((3|5|8)[0-9])|(47))[0-9]{8}";
+        regex[4] = "(((0?)|[1-9])(0?|[1-9])([0-9][0-9])\\-)?[1-9]([0-9]{6})[0-9]?((\\-)[0-9]{1,4})?";
+        regex[5] = "[1-9][0-9]{4,9}";
+        regex[6] = "(([1-5][1-9])|6[1-5]|(71)|(81)|(82))([0-9]{4})((18)|(19)|(20))([0-9]{2})((0[1-9])|(11)|(12))(([0-2][0-9])|30|31)[0-9]{3}([0-9]|x|X)";
+        regex[7] = "([0-9]{2,4}(\\-)((0?[1-9])|(10)|(11)|(12))(\\-)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|([0-9]{2,4}(\\.)((0?[1-9])|(10)|(11)|(12))(\\.)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|([0-9]{2,4}(\\/)((0?[1-9])|(10)|(11)|(12))(\\/)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\-((0?[1-9])|(10)|(11)|(12))\\-([0-9]{2,4}))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\.((0?[1-9])|(10)|(11)|(12))\\.([0-9]{2,4}))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\/((0?[1-9])|(10)|(11)|(12))\\/([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\-(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\-([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\.(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\.([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\/(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\/([0-9]{2,4}))";
+        regex[8] = "(((1[0-9])|(2[0-3])|(0?[0-9]))\\:([0-5][0-9])(\\:[0-5][0-9])?)|(24\\:00(\\:00)?)";
+        regex[9] = "ed2k://\\|file\\|[\\w\\-\\%\\(\\)\\[\\]\\.\\!]*[\\w]+\\|[0-9]+\\|[0-9A-F]+\\|((p|h)\\=[0-9A-Z]+(\\|)?)?(\\/)?";
+        regex[10] = "thunder://[\\w\\+\\/\\=]+";
+        regex[11] = userdefsource; // user-defined regular expression 1
+        regex[12] = userdeftext; // user-defined regular expression 2
+        regex[13] = URLREGEX; // URL again, for the temporary file
+        return regex;
+    }
+
+    /* Returns the text of the page: line breaks, script and style elements and all tags removed, some entities decoded. */
+    private String plaintext() {
+        return content.replace("\n","").replace("\r","").replaceAll("<(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)[^>]*?>.*?</(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)>", "").replaceAll("<(s|S)(t|T)(y|Y)(l|L)(e|E)[^>]*?>.*?</(s|S)(t|T)(y|Y)(l|L)(e|E)>","").replaceAll("<.*?>", "").replaceAll("\\&((nbsp)|(\\#12288)|(\\#160))(\\;)?", " ").replaceAll("\\&((lt)|(\\#60))(\\;)?","<").replaceAll("\\&((gt)|(\\#62))(\\;)?",">").replaceAll("\\&((quot)|(#34))(\\;)?","\"").replaceAll("\\&((apos)|(\\#39))(\\;)?","'").replaceAll("\\&copy(\\;)?","©").replaceAll("\\&reg(\\;)?","®").replaceAll("\\&((amp)|(#38))(\\;)?","&");
+    }
+
+    /* Adds the relative links of the form <a href="/..." to the set, each one prefixed with the scheme and host part of url. */
+    private void addrelativelinks(Set<String> set) {
+        int hostend = url.indexOf("/", url.indexOf("://")+5); // first "/" behind "://"; the search starts inside the host name
+        String root = (hostend == -1) ? url : url.substring(0, hostend);
+        Matcher matcher = Pattern.compile("\\<a\\shref\\=\"\\/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*", Pattern.CASE_INSENSITIVE).matcher(content);
+        while (matcher.find()) {
+            String link = matcher.group();
+            set.add(root+link.substring(link.indexOf("/"))); // keep the part from the first "/" on
+        }
+    }
+
+    /* Writes the items to the file, one per line, encoded as UTF-8. The file is closed even if writing fails. */
+    private static void writelines(File file, Set<String> set) throws IOException {
+        Writer out = new OutputStreamWriter(new FileOutputStream(file, false), "UTF-8");
+        try {
+            for (String item : set) out.write(item+"\r\n");
+        } finally {
+            out.close();
+        }
+    }
+
+    /* Gets the page title from the title element of the source code, or "" if there is none. The characters
+       : < > ? | * / \ and " are removed, because the title becomes part of a file name. */
+    public String gettitle() {
+        String flat = content.replace("\n", "").replace("\r", "").replaceAll("[\\s]+"," ");
+        Matcher matcher = Pattern.compile("<title(\\s[^>]*)?>[\\s\\S]*?</title>", Pattern.CASE_INSENSITIVE).matcher(flat); // upper case and attributes are accepted
+        if (!matcher.find()) return "";
+        String title = matcher.group().replaceAll("<.*?>", "").replace(":", "").replace("<","").replace(">","").replace("?","").replace("|","").replace("*","").replace("/","").replace("\\","").replace("\"", "");
+        if (title.startsWith(" ")) title = title.substring(1);
+        return title;
+    }
+
+}
diff --git a/Java_Programming/src/crawler/SameFileName.java b/Java_Programming/src/crawler/SameFileName.java
new file mode 100644
index 0000000..02fef2b
--- /dev/null
+++ b/Java_Programming/src/crawler/SameFileName.java
@@ -0,0 +1,42 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package crawler;
+
+import java.io.*;
+/**
+ *
+ * @author 1100012773, 1100012778
+ */
+
+/**
+ *  SameFileName class.
+ *  Picks a file name that is not used yet in a directory by adding an index
+ *  in square brackets to the wanted name (e.g. a.txt -> a[2].txt).
+ */
+public class SameFileName {
+
+    /**
+     * Returns a file name that does not exist in the given directory.
+     *
+     * @param dir         directory path; the name is appended to it as is, so it has to end with a separator
+     * @param oldfilename the wanted file name
+     * @return oldfilename if dir+oldfilename does not exist, otherwise the first of name[2], name[3], ...
+     *         (index placed in front of the last ".") that does not exist
+     */
+    public static String newfilename(String dir, String oldfilename) {
+        if (!new File(dir+oldfilename).exists()) return oldfilename;
+        /* Split the name at its last dot; a name without a dot has an empty suffix. */
+        int dot = oldfilename.lastIndexOf(".");
+        String stem = (dot == -1) ? oldfilename : oldfilename.substring(0, dot);
+        String suffix = (dot == -1) ? "" : oldfilename.substring(dot);
+
+        /* Try [2], [3], ... until a name is found that is not used. */
+        for (int index = 2; ; index++) {
+            String candidate = stem+"["+index+"]"+suffix;
+            if (!new File(dir+candidate).exists()) return candidate;
+        }
+    }
+
+}
diff --git a/Java_Programming/src/crawler/TxtFileFilter.java b/Java_Programming/src/crawler/TxtFileFilter.java
new file mode 100644
index 0000000..77cc8ad
--- /dev/null
+++ b/Java_Programming/src/crawler/TxtFileFilter.java
@@ -0,0 +1,34 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package crawler;
+
+import java.io.File;
+import javax.swing.filechooser.*;
+
+/**
+ *
+ * @author 1100012773, 1100012778
+ */
+
+/**
+ *  TxtFileFilter class.
+ *  File filter that is only used by the file chooser dialog opened with the
+ *  "Browse" button: it shows directories and files whose name ends with ".txt".
+ */
+class TxtFileFilter extends FileFilter {
+
+    /* Accepts directories and txt files; the case of the file name does not matter. */
+    @Override
+    public boolean accept(File f) {
+        return f.isDirectory() || f.getName().toLowerCase().endsWith(".txt");
+    }
+
+    /* Returns the type hint of this filter. */
+    @Override
+    public String getDescription() {
+        final String description = "文本文件 (*.txt)";
+        return description;
+    }
+}
diff --git a/Java_Programming/src/crawler/Work.java b/Java_Programming/src/crawler/Work.java
new file mode 100644
index 0000000..c0ec2bc
--- /dev/null
+++ b/Java_Programming/src/crawler/Work.java
@@ -0,0 +1,201 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package crawler;
+
+import java.awt.Color;
+import java.util.*;
+import java.io.*;
+import java.util.concurrent.*;
+import java.util.regex.*;
+
+/**
+ *
+ * @author 1100012773, 1100012778
+ */
+
+/**
+ *  Work class.
+ *  Worker thread of the web crawler: starts the DownloadPage threads and keeps
+ *  the queue of the URLs to connect to.
+ */
+public class Work extends Thread {
+
+    /* One part of a dotted ip address: 0 to 255. It used to be written "(25)[0-5][0-5]", which rejected 250 to 255. */
+    private static final String IPPART = "((25[0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9]))";
+    /* A well-formed proxy server address. */
+    private static final Pattern PROXYADDR = Pattern.compile(IPPART+"\\."+IPPART+"\\."+IPPART+"\\."+IPPART);
+    /* Maximum number of DownloadPage threads started in one round. */
+    private static final int MAXTHREADS = 500;
+
+    /* Whether to stop (controlled by Crawler). Volatile, so that a change made by another thread is seen by run(). */
+    volatile boolean needstop;
+
+    /* Constructor */
+    public Work() {
+        needstop = false;
+    }
+
+    /**
+     * Crawls round by round: every round takes up to MAXTHREADS URLs from the queue, starts one
+     * DownloadPage thread for each of them, waits for the round to finish and, if check box 13 is
+     * selected, queues the URLs found in the temporary files. Ends when the queue is empty or needstop is set.
+     */
+    @Override
+    public void run() {
+        /* When work starts, change the colours of the three buttons. */
+        Crawler.startjb.setBackground(Color.LIGHT_GRAY);
+        Crawler.stopjb.setBackground(null);
+        Crawler.choosefilejb.setBackground(Color.LIGHT_GRAY);
+
+        Set<String> set = new HashSet<String>(); // URLs that were connected to or queued
+        Queue<String> queue = new LinkedList<String>(); // URLs still to connect to
+        if (Crawler.inputurljrb.isSelected()) {
+            queue.offer(Crawler.inputurljtf.getText()); // the URL was typed in directly
+        } else {
+            /* Import the URLs from a txt file, one per line. */
+            queue.addAll(readlines(new File(Crawler.importtxtjtf.getText()), "找不到指定文件", "读取文件失败", "关闭文件失败"));
+        }
+
+        /* Check whether a proxy server is wanted and whether its address and port are well formed. */
+        boolean useproxy = Crawler.useproxyjcb.isSelected();
+        if (useproxy) {
+            if (!PROXYADDR.matcher(Crawler.proxyaddrjtf.getText()).matches()) {
+                useproxy = false;
+                System.out.println("代理服务器地址格式错误");
+            }
+            else if (!Pattern.matches(("[0-9]{1,5}"), Crawler.proxyportjtf.getText())) {
+                useproxy = false;
+                System.out.println("代理服务器端口格式错误");
+            }
+        }
+        boolean sendcookie = Crawler.sendcookiejcb.isSelected(); // whether a Cookie has to be sent
+
+        /* One round per iteration, until there is no URL left or a stop was requested. */
+        while (!queue.isEmpty() && !needstop) {
+            int queuesize = Math.min(queue.size(), MAXTHREADS);
+            /* Take over the state of the 18 check boxes for this round. */
+            boolean b[] = new boolean[18];
+            for (int k = 0; k <= 17; k++)
+                b[k] = Crawler.arrjcb[k].isSelected();
+            /* Count exactly the URLs taken in this round. Counting queue.size() made the latch unreachable,
+               and the wait below endless, as soon as more than MAXTHREADS URLs were queued. */
+            CountDownLatch runningthreadnum = new CountDownLatch(queuesize);
+            DownloadPage task[] = new DownloadPage[queuesize+1];
+            /* Start one DownloadPage thread per URL of this round. */
+            for (Integer i = 1; i <= queuesize; i++) {
+                if (needstop) {
+                    stopall(task);
+                    break;
+                }
+                String str = queue.poll().replace(" ", "");
+                if (str.equals("")) {
+                    runningthreadnum.countDown(); // empty URL: no task is created, so count it as finished
+                    continue;
+                }
+                if (str.indexOf("://") == -1 || str.indexOf("://") > 12) str = "http://" + str; // add http:// if the URL has no scheme
+                set.add(str);
+                task[i] = new DownloadPage(str, i, b, runningthreadnum); // new task
+                if (useproxy) task[i].getproxyinfo(Crawler.proxyaddrjtf.getText(), Crawler.proxyportjtf.getText()); // set the proxy if wanted
+                if (sendcookie) task[i].getcookiecontent(Crawler.cookiejtf.getText());
+                task[i].gettext(Crawler.userdefsourcejtf.getText(), Crawler.userdeftextjtf.getText(), Crawler.continueurljtf.getText(), Crawler.saveformatjtf.getText()); // contents of the input boxes
+                task[i].start(); // the task starts
+            }
+
+            try {
+                /* Wait until all DownloadPage threads of this round have finished. */
+                while (runningthreadnum.getCount() > 0) {
+                    sleep(200);
+                    if (needstop) {
+                        /* A stop was requested: tell the tasks and wait one more second. */
+                        stopall(task);
+                        sleep(1000);
+                        break;
+                    }
+                }
+            } catch (InterruptedException ex) {
+                System.out.println("线程中断异常");
+            }
+
+            /* If new pages have to be crawled, add them to the queue. */
+            if (b[13]) collecturls(queuesize, set, queue);
+        }
+
+        /* When work ends, change the colours of the three buttons. */
+        Crawler.startjb.setBackground(null);
+        Crawler.stopjb.setBackground(Color.LIGHT_GRAY);
+        Crawler.choosefilejb.setBackground(null);
+    }
+
+    /**
+     * Asks every task created in the current round to stop.
+     * @param task tasks of the round, stored from index 1 on; slots without a task are null
+     */
+    private static void stopall(DownloadPage task[]) {
+        for (int j = 1; j < task.length; j++)
+            if (task[j] != null) task[j].needstop = true;
+    }
+
+    /**
+     * Reads the temporary files "~1saveurls.txt", "~2saveurls.txt", ... of the last round, queues every URL
+     * that matches the expression of the continue input box and was not seen before, and deletes the files.
+     * @param queuesize number of URLs taken in the last round
+     * @param set       URLs seen so far; the new URLs are added to it
+     * @param queue     queue the new URLs are appended to
+     */
+    private void collecturls(int queuesize, Set<String> set, Queue<String> queue) {
+        Pattern filter = null; // stays null when the input box is empty: every URL passes
+        boolean valid = true;
+        try {
+            String regex = Crawler.continueurljtf.getText();
+            if (!regex.equals("")) filter = Pattern.compile(regex); // compiled once instead of once per line
+        } catch (PatternSyntaxException ex) {
+            System.out.println("自定义正则表达式语法错误");
+            valid = false; // no URL can pass, but the temporary files are still deleted
+        }
+
+        for (Integer i = 1; i <= queuesize; i++) {
+            if (needstop) break;
+            File file = new File("~"+i.toString()+"saveurls.txt");
+            if (!file.exists()) continue;
+            if (valid) {
+                for (String line : readlines(file, "临时保存url的文件不存在", "读取临时保存url的文件失败", "关闭临时文件失败")) {
+                    boolean pass = (filter == null) || filter.matcher(line).matches();
+                    if (pass && set.add(line)) queue.offer(line);
+                }
+            }
+            file.delete();
+        }
+    }
+
+    /**
+     * Reads all lines of a text file with the platform default charset.
+     * @param file      file to read
+     * @param notfound  message printed when the file cannot be opened
+     * @param readfail  message printed when reading fails
+     * @param closefail message printed when closing fails
+     * @return the lines read before the end of the file or the first error
+     */
+    private static List<String> readlines(File file, String notfound, String readfail, String closefail) {
+        List<String> lines = new ArrayList<String>();
+        BufferedReader br = null;
+        try {
+            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+            String line;
+            while ((line = br.readLine()) != null) lines.add(line);
+        } catch (FileNotFoundException e) {
+            System.out.println(notfound);
+        } catch (IOException e) {
+            System.out.println(readfail);
+        } finally {
+            try {
+                if (br != null) br.close(); // also closes the wrapped streams
+            } catch (IOException ex) {
+                System.out.println(closefail);
+            }
+        }
+        return lines;
+    }
+
+}