From b55e86c40530581cac07df709fe7088874db0f5a Mon Sep 17 00:00:00 2001 From: william Date: Wed, 24 Aug 2016 18:48:12 -0400 Subject: [PATCH] Nutch-Ajax-Solr --- README.md | 77 + apache-nutch-2.3/.classpath | 73 + apache-nutch-2.3/.gitignore | 3 + apache-nutch-2.3/.project | 19 + .../.settings/org.apache.ivyde.eclipse.prefs | 2 + .../org.eclipse.core.resources.prefs | 2 + apache-nutch-2.3/CHANGES.txt | 2090 ++++++ apache-nutch-2.3/LICENSE.txt | 5793 +++++++++++++++++ apache-nutch-2.3/NOTICE.txt | 15 + apache-nutch-2.3/README.txt | 36 + apache-nutch-2.3/build.xml | 1043 +++ apache-nutch-2.3/conf/automaton-urlfilter.txt | 35 + apache-nutch-2.3/conf/configuration.xsl | 40 + apache-nutch-2.3/conf/domain-suffixes.xml | 4428 +++++++++++++ apache-nutch-2.3/conf/domain-suffixes.xsd | 130 + apache-nutch-2.3/conf/domain-urlfilter.txt | 16 + apache-nutch-2.3/conf/elasticsearch.conf | 17 + .../conf/gora-accumulo-mapping.xml | 85 + .../conf/gora-cassandra-mapping.xml | 92 + apache-nutch-2.3/conf/gora-hbase-mapping.xml | 98 + .../conf/gora-mongodb-mapping.xml | 61 + .../conf/gora-solr-host-schema.xml | 331 + apache-nutch-2.3/conf/gora-solr-mapping.xml | 61 + .../conf/gora-solr-webpage-schema.xml | 359 + apache-nutch-2.3/conf/gora-sql-mapping.xml | 60 + apache-nutch-2.3/conf/gora.properties | 111 + apache-nutch-2.3/conf/hbase-site.xml | 35 + apache-nutch-2.3/conf/htmlunit-urlfilter.txt | 70 + apache-nutch-2.3/conf/httpclient-auth.xml | 61 + apache-nutch-2.3/conf/log4j.properties | 112 + apache-nutch-2.3/conf/nutch-conf.xsl | 24 + apache-nutch-2.3/conf/nutch-default.xml | 1356 ++++ apache-nutch-2.3/conf/nutch-site.xml | 190 + apache-nutch-2.3/conf/parse-plugins.dtd | 12 + apache-nutch-2.3/conf/parse-plugins.xml | 98 + apache-nutch-2.3/conf/prefix-urlfilter.txt | 21 + apache-nutch-2.3/conf/regex-normalize.xml | 94 + apache-nutch-2.3/conf/regex-urlfilter.txt | 94 + apache-nutch-2.3/conf/schema.xml | 375 ++ apache-nutch-2.3/conf/solrindex-mapping.xml | 43 + 
apache-nutch-2.3/conf/subcollections.xml | 28 + apache-nutch-2.3/conf/suffix-urlfilter.txt | 116 + apache-nutch-2.3/default.properties | 173 + apache-nutch-2.3/src/bin/crawl | 184 + apache-nutch-2.3/src/bin/nutch | 266 + apache-nutch-2.3/src/gora/host.avsc | 41 + apache-nutch-2.3/src/gora/webpage.avsc | 285 + .../org/apache/nutch/api/ConfManager.java | 38 + .../java/org/apache/nutch/api/JobManager.java | 40 + .../org/apache/nutch/api/NutchServer.java | 286 + .../org/apache/nutch/api/impl/JobFactory.java | 72 + .../org/apache/nutch/api/impl/JobWorker.java | 103 + .../api/impl/NutchServerPoolExecutor.java | 112 + .../apache/nutch/api/impl/RAMConfManager.java | 119 + .../apache/nutch/api/impl/RAMJobManager.java | 99 + .../apache/nutch/api/impl/db/DbIterator.java | 112 + .../nutch/api/impl/db/DbPageConverter.java | 135 + .../apache/nutch/api/impl/db/DbReader.java | 91 + .../apache/nutch/api/impl/package-info.java | 22 + .../nutch/api/misc/ErrorStatusService.java | 40 + .../nutch/api/model/request/DbFilter.java | 67 + .../nutch/api/model/request/JobConfig.java | 69 + .../nutch/api/model/request/NutchConfig.java | 51 + .../nutch/api/model/request/SeedList.java | 79 + .../nutch/api/model/request/SeedUrl.java | 61 + .../api/model/response/DbQueryResult.java | 35 + .../api/model/response/ErrorResponse.java | 47 + .../nutch/api/model/response/JobInfo.java | 113 + .../nutch/api/model/response/NutchStatus.java | 82 + .../org/apache/nutch/api/package-info.java | 22 + .../nutch/api/resources/AbstractResource.java | 48 + .../nutch/api/resources/AdminResource.java | 83 + .../nutch/api/resources/ConfigResource.java | 87 + .../nutch/api/resources/DbResource.java | 61 + .../nutch/api/resources/JobResource.java | 72 + .../nutch/api/resources/SeedResource.java | 109 + .../nutch/crawl/AbstractFetchSchedule.java | 224 + .../nutch/crawl/AdaptiveFetchSchedule.java | 122 + .../org/apache/nutch/crawl/CrawlStatus.java | 54 + .../apache/nutch/crawl/DbUpdateMapper.java | 114 + 
.../apache/nutch/crawl/DbUpdateReducer.java | 218 + .../org/apache/nutch/crawl/DbUpdaterJob.java | 184 + .../nutch/crawl/DefaultFetchSchedule.java | 42 + .../org/apache/nutch/crawl/FetchSchedule.java | 167 + .../nutch/crawl/FetchScheduleFactory.java | 53 + .../org/apache/nutch/crawl/GeneratorJob.java | 321 + .../apache/nutch/crawl/GeneratorMapper.java | 125 + .../apache/nutch/crawl/GeneratorReducer.java | 114 + .../org/apache/nutch/crawl/InjectorJob.java | 288 + .../org/apache/nutch/crawl/MD5Signature.java | 73 + .../org/apache/nutch/crawl/NutchWritable.java | 45 + .../org/apache/nutch/crawl/Signature.java | 30 + .../nutch/crawl/SignatureComparator.java | 59 + .../apache/nutch/crawl/SignatureFactory.java | 66 + .../apache/nutch/crawl/TextMD5Signature.java | 58 + .../nutch/crawl/TextProfileSignature.java | 177 + .../apache/nutch/crawl/URLPartitioner.java | 165 + .../org/apache/nutch/crawl/URLWebPage.java | 48 + .../org/apache/nutch/crawl/UrlWithScore.java | 195 + .../apache/nutch/crawl/WebTableReader.java | 651 ++ .../java/org/apache/nutch/crawl/package.html | 5 + .../org/apache/nutch/fetcher/FetchEntry.java | 70 + .../org/apache/nutch/fetcher/FetcherJob.java | 328 + .../apache/nutch/fetcher/FetcherReducer.java | 935 +++ .../org/apache/nutch/fetcher/package.html | 5 + .../java/org/apache/nutch/host/HostDb.java | 141 + .../org/apache/nutch/host/HostDbReader.java | 93 + .../apache/nutch/host/HostDbUpdateJob.java | 140 + .../nutch/host/HostDbUpdateReducer.java | 97 + .../apache/nutch/host/HostInjectorJob.java | 181 + .../org/apache/nutch/host/package-info.java | 22 + .../org/apache/nutch/indexer/CleaningJob.java | 183 + .../nutch/indexer/IndexCleaningFilter.java | 42 + .../nutch/indexer/IndexCleaningFilters.java | 125 + .../org/apache/nutch/indexer/IndexUtil.java | 107 + .../org/apache/nutch/indexer/IndexWriter.java | 47 + .../apache/nutch/indexer/IndexWriters.java | 144 + .../nutch/indexer/IndexerOutputFormat.java | 92 + .../nutch/indexer/IndexingException.java | 39 + 
.../apache/nutch/indexer/IndexingFilter.java | 50 + .../apache/nutch/indexer/IndexingFilters.java | 132 + .../nutch/indexer/IndexingFiltersChecker.java | 172 + .../org/apache/nutch/indexer/IndexingJob.java | 215 + .../apache/nutch/indexer/NutchDocument.java | 159 + .../org/apache/nutch/indexer/package.html | 10 + .../nutch/indexer/solr/SolrConstants.java | 46 + .../indexer/solr/SolrDeleteDuplicates.java | 407 ++ .../apache/nutch/indexer/solr/SolrUtils.java | 69 + .../nutch/metadata/CreativeCommons.java | 35 + .../org/apache/nutch/metadata/DublinCore.java | 161 + .../java/org/apache/nutch/metadata/Feed.java | 38 + .../apache/nutch/metadata/HttpHeaders.java | 49 + .../apache/nutch/metadata/MetaWrapper.java | 120 + .../org/apache/nutch/metadata/Metadata.java | 259 + .../java/org/apache/nutch/metadata/Nutch.java | 127 + .../nutch/metadata/SpellCheckedMetadata.java | 150 + .../org/apache/nutch/metadata/package.html | 6 + .../java/org/apache/nutch/net/URLFilter.java | 40 + .../apache/nutch/net/URLFilterChecker.java | 133 + .../apache/nutch/net/URLFilterException.java | 39 + .../java/org/apache/nutch/net/URLFilters.java | 96 + .../org/apache/nutch/net/URLNormalizer.java | 37 + .../nutch/net/URLNormalizerChecker.java | 117 + .../org/apache/nutch/net/URLNormalizers.java | 322 + .../org/apache/nutch/net/package-info.java | 23 + .../nutch/net/protocols/HttpDateFormat.java | 124 + .../net/protocols/ProtocolException.java | 47 + .../apache/nutch/net/protocols/Response.java | 46 + .../nutch/net/protocols/package-info.java | 23 + .../org/apache/nutch/parse/HTMLMetaTags.java | 202 + .../java/org/apache/nutch/parse/Outlink.java | 83 + .../apache/nutch/parse/OutlinkExtractor.java | 254 + .../java/org/apache/nutch/parse/Parse.java | 68 + .../org/apache/nutch/parse/ParseCallable.java | 38 + .../apache/nutch/parse/ParseException.java | 39 + .../org/apache/nutch/parse/ParseFilter.java | 40 + .../org/apache/nutch/parse/ParseFilters.java | 127 + .../apache/nutch/parse/ParsePluginList.java 
| 71 + .../nutch/parse/ParsePluginsReader.java | 278 + .../apache/nutch/parse/ParseStatusCodes.java | 73 + .../apache/nutch/parse/ParseStatusUtils.java | 121 + .../org/apache/nutch/parse/ParseUtil.java | 299 + .../java/org/apache/nutch/parse/Parser.java | 44 + .../org/apache/nutch/parse/ParserChecker.java | 220 + .../org/apache/nutch/parse/ParserFactory.java | 438 ++ .../org/apache/nutch/parse/ParserJob.java | 348 + .../apache/nutch/parse/ParserNotFound.java | 47 + .../org/apache/nutch/parse/package-info.java | 22 + .../plugin/CircularDependencyException.java | 36 + .../org/apache/nutch/plugin/Extension.java | 194 + .../apache/nutch/plugin/ExtensionPoint.java | 123 + .../apache/nutch/plugin/FieldPluggable.java | 26 + .../plugin/MissingDependencyException.java | 36 + .../org/apache/nutch/plugin/Pluggable.java | 30 + .../java/org/apache/nutch/plugin/Plugin.java | 95 + .../nutch/plugin/PluginClassLoader.java | 80 + .../apache/nutch/plugin/PluginDescriptor.java | 354 + .../nutch/plugin/PluginManifestParser.java | 303 + .../apache/nutch/plugin/PluginRepository.java | 445 ++ .../nutch/plugin/PluginRuntimeException.java | 37 + .../java/org/apache/nutch/plugin/package.html | 40 + .../org/apache/nutch/protocol/Content.java | 315 + .../org/apache/nutch/protocol/Protocol.java | 65 + .../nutch/protocol/ProtocolException.java | 39 + .../nutch/protocol/ProtocolFactory.java | 139 + .../nutch/protocol/ProtocolNotFound.java | 36 + .../apache/nutch/protocol/ProtocolOutput.java | 56 + .../nutch/protocol/ProtocolStatusCodes.java | 60 + .../nutch/protocol/ProtocolStatusUtils.java | 125 + .../org/apache/nutch/protocol/RobotRules.java | 43 + .../nutch/protocol/RobotRulesParser.java | 190 + .../apache/nutch/protocol/package-info.java | 23 + .../org/apache/nutch/scoring/ScoreDatum.java | 121 + .../apache/nutch/scoring/ScoringFilter.java | 139 + .../nutch/scoring/ScoringFilterException.java | 43 + .../apache/nutch/scoring/ScoringFilters.java | 156 + 
.../apache/nutch/scoring/package-info.java | 22 + .../java/org/apache/nutch/storage/Host.java | 587 ++ .../java/org/apache/nutch/storage/Mark.java | 64 + .../org/apache/nutch/storage/ParseStatus.java | 616 ++ .../apache/nutch/storage/ProtocolStatus.java | 647 ++ .../apache/nutch/storage/StorageUtils.java | 175 + .../org/apache/nutch/storage/WebPage.java | 2879 ++++++++ .../apache/nutch/storage/WebTableCreator.java | 29 + .../apache/nutch/storage/package-info.java | 23 + .../org/apache/nutch/tools/Benchmark.java | 266 + .../org/apache/nutch/tools/DmozParser.java | 430 ++ .../org/apache/nutch/tools/ResolveUrls.java | 206 + .../nutch/tools/arc/ArcInputFormat.java | 51 + .../nutch/tools/arc/ArcRecordReader.java | 300 + .../apache/nutch/tools/arc/package-info.java | 23 + .../org/apache/nutch/tools/package-info.java | 22 + .../tools/proxy/AbstractTestbedHandler.java | 65 + .../nutch/tools/proxy/DelayHandler.java | 72 + .../apache/nutch/tools/proxy/FakeHandler.java | 178 + .../nutch/tools/proxy/LogDebugHandler.java | 80 + .../nutch/tools/proxy/NotFoundHandler.java | 56 + .../nutch/tools/proxy/TestbedProxy.java | 184 + .../nutch/tools/proxy/package-info.java | 22 + .../src/java/org/apache/nutch/util/Bytes.java | 1453 +++++ .../org/apache/nutch/util/CommandRunner.java | 291 + .../org/apache/nutch/util/DeflateUtils.java | 140 + .../java/org/apache/nutch/util/DomUtil.java | 105 + .../apache/nutch/util/EncodingDetector.java | 393 ++ .../java/org/apache/nutch/util/FSUtils.java | 106 + .../java/org/apache/nutch/util/GZIPUtils.java | 148 + .../util/GenericWritableConfigurable.java | 60 + .../org/apache/nutch/util/HadoopFSUtil.java | 72 + .../java/org/apache/nutch/util/Histogram.java | 129 + .../nutch/util/IdentityPageReducer.java | 35 + .../java/org/apache/nutch/util/LockUtil.java | 84 + .../java/org/apache/nutch/util/MimeUtil.java | 278 + .../org/apache/nutch/util/NodeWalker.java | 130 + .../apache/nutch/util/NutchConfiguration.java | 104 + 
.../java/org/apache/nutch/util/NutchJob.java | 71 + .../org/apache/nutch/util/NutchJobConf.java | 30 + .../java/org/apache/nutch/util/NutchTool.java | 99 + .../org/apache/nutch/util/ObjectCache.java | 56 + .../src/java/org/apache/nutch/util/Pair.java | 37 + .../nutch/util/PrefixStringMatcher.java | 119 + .../org/apache/nutch/util/StringUtil.java | 211 + .../nutch/util/SuffixStringMatcher.java | 114 + .../java/org/apache/nutch/util/TableUtil.java | 162 + .../org/apache/nutch/util/TimingUtil.java | 59 + .../java/org/apache/nutch/util/ToolUtil.java | 81 + .../apache/nutch/util/TrieStringMatcher.java | 202 + .../java/org/apache/nutch/util/URLUtil.java | 481 ++ .../apache/nutch/util/WebPageWritable.java | 60 + .../nutch/util/domain/DomainStatistics.java | 236 + .../nutch/util/domain/DomainSuffix.java | 80 + .../nutch/util/domain/DomainSuffixes.java | 86 + .../util/domain/DomainSuffixesReader.java | 164 + .../nutch/util/domain/TopLevelDomain.java | 64 + .../org/apache/nutch/util/domain/package.html | 14 + .../org/apache/nutch/util/package-info.java | 22 + .../nutch/webui/NutchUiApplication.java | 75 + .../nutch/webui/NutchUiApplication.properties | 63 + .../org/apache/nutch/webui/NutchUiServer.java | 104 + .../nutch/webui/client/NutchClient.java | 49 + .../webui/client/NutchClientFactory.java | 52 + .../webui/client/impl/CrawlingCycle.java | 82 + .../client/impl/CrawlingCycleListener.java | 31 + .../webui/client/impl/NutchClientImpl.java | 98 + .../webui/client/impl/RemoteCommand.java | 76 + .../client/impl/RemoteCommandBuilder.java | 64 + .../client/impl/RemoteCommandExecutor.java | 110 + .../impl/RemoteCommandsBatchFactory.java | 97 + .../webui/client/model/ConnectionStatus.java | 21 + .../nutch/webui/client/model/Crawl.java | 126 + .../nutch/webui/client/model/JobConfig.java | 77 + .../nutch/webui/client/model/JobInfo.java | 104 + .../nutch/webui/client/model/NutchStatus.java | 62 + .../nutch/webui/config/CustomDaoFactory.java | 58 + 
.../webui/config/CustomTableCreator.java | 83 + .../webui/config/NutchGuiConfiguration.java | 33 + .../webui/config/SpringConfiguration.java | 91 + .../apache/nutch/webui/model/NutchConfig.java | 22 + .../nutch/webui/model/NutchInstance.java | 118 + .../apache/nutch/webui/model/SeedList.java | 104 + .../org/apache/nutch/webui/model/SeedUrl.java | 95 + .../nutch/webui/pages/AbstractBasePage.html | 33 + .../nutch/webui/pages/AbstractBasePage.java | 181 + .../nutch/webui/pages/DashboardPage.html | 52 + .../nutch/webui/pages/DashboardPage.java | 56 + .../apache/nutch/webui/pages/LogOutPage.java | 21 + .../nutch/webui/pages/SchedulingPage.java | 21 + .../apache/nutch/webui/pages/SearchPage.java | 21 + .../nutch/webui/pages/StatisticsPage.java | 21 + .../nutch/webui/pages/UrlsUploadPage.java | 21 + .../nutch/webui/pages/UserSettingsPage.java | 21 + .../pages/assets/NutchUiCssReference.java | 39 + .../nutch/webui/pages/assets/nutch-style.css | 149 + .../pages/components/ColorEnumLabel.java | 71 + .../components/ColorEnumLabelBuilder.java | 49 + .../pages/components/CpmIteratorAdapter.java | 41 + .../nutch/webui/pages/crawls/CrawlPanel.html | 58 + .../nutch/webui/pages/crawls/CrawlPanel.java | 98 + .../nutch/webui/pages/crawls/CrawlsPage.html | 90 + .../nutch/webui/pages/crawls/CrawlsPage.java | 139 + .../webui/pages/instances/InstancePanel.html | 46 + .../webui/pages/instances/InstancePanel.java | 62 + .../webui/pages/instances/InstancesPage.html | 66 + .../webui/pages/instances/InstancesPage.java | 111 + .../nutch/webui/pages/menu/VerticalMenu.html | 48 + .../nutch/webui/pages/menu/VerticalMenu.java | 28 + .../nutch/webui/pages/seed/SeedListsPage.html | 75 + .../nutch/webui/pages/seed/SeedListsPage.java | 79 + .../nutch/webui/pages/seed/SeedPage.html | 91 + .../nutch/webui/pages/seed/SeedPage.java | 153 + .../webui/pages/settings/SettingsPage.html | 43 + .../webui/pages/settings/SettingsPage.java | 59 + .../nutch/webui/service/CrawlService.java | 33 + 
.../webui/service/NutchInstanceService.java | 33 + .../nutch/webui/service/NutchService.java | 31 + .../nutch/webui/service/SeedListService.java | 33 + .../webui/service/impl/CrawlServiceImpl.java | 129 + .../impl/NutchInstanceServiceImpl.java | 76 + .../webui/service/impl/NutchServiceImpl.java | 82 + .../service/impl/SeedListServiceImpl.java | 77 + apache-nutch-2.3/src/java/overview.html | 13 + apache-nutch-2.3/src/plugin/build-plugin.xml | 239 + apache-nutch-2.3/src/plugin/build.xml | 174 + .../src/plugin/creativecommons/README.txt | 1 + .../src/plugin/creativecommons/build.xml | 28 + .../creativecommons/conf/crawl-urlfilter.txt | 18 + .../creativecommons/conf/nutch-site.xml | 50 + .../plugin/creativecommons/data/anchor.html | 9 + .../src/plugin/creativecommons/data/rdf.html | 35 + .../src/plugin/creativecommons/data/rel.html | 6 + .../src/plugin/creativecommons/ivy.xml | 41 + .../src/plugin/creativecommons/plugin.xml | 48 + .../nutch/CCIndexingFilter.java | 138 + .../creativecommons/nutch/CCParseFilter.java | 309 + .../org/creativecommons/nutch/package.html | 5 + .../nutch/TestCCParseFilter.java | 85 + apache-nutch-2.3/src/plugin/feed/build.xml | 45 + apache-nutch-2.3/src/plugin/feed/ivy.xml | 43 + apache-nutch-2.3/src/plugin/feed/plugin.xml | 48 + .../src/plugin/feed/sample/rsstest.rss | 36 + .../indexer/feed/FeedIndexingFilter.java | 131 + .../nutch/indexer/feed/package-info.java | 21 + .../apache/nutch/parse/feed/FeedParser.java | 372 ++ .../apache/nutch/parse/feed/package-info.java | 21 + .../nutch/parse/feed/TestFeedParser.java | 133 + .../src/plugin/index-anchor/build.xml | 22 + .../src/plugin/index-anchor/ivy.xml | 41 + .../src/plugin/index-anchor/plugin.xml | 38 + .../indexer/anchor/AnchorIndexingFilter.java | 127 + .../apache/nutch/indexer/anchor/package.html | 5 + .../anchor/TestAnchorIndexingFilter.java | 57 + .../src/plugin/index-basic/build.xml | 22 + .../src/plugin/index-basic/ivy.xml | 41 + .../src/plugin/index-basic/plugin.xml | 42 + 
.../indexer/basic/BasicIndexingFilter.java | 165 + .../apache/nutch/indexer/basic/package.html | 5 + .../basic/TestBasicIndexingFilter.java | 97 + .../src/plugin/index-metadata/build.xml | 22 + .../src/plugin/index-metadata/ivy.xml | 41 + .../src/plugin/index-metadata/plugin.xml | 42 + .../indexer/metadata/MetadataIndexer.java | 96 + .../nutch/indexer/metadata/package-info.java | 23 + .../src/plugin/index-more/build.xml | 22 + .../src/plugin/index-more/ivy.xml | 41 + .../src/plugin/index-more/plugin.xml | 52 + .../indexer/more/MoreIndexingFilter.java | 278 + .../apache/nutch/indexer/more/package.html | 6 + .../indexer/more/TestMoreIndexingFilter.java | 90 + .../src/plugin/index-s2jh/build.xml | 40 + .../src/plugin/index-s2jh/ivy.xml | 38 + .../src/plugin/index-s2jh/plugin.xml | 42 + .../indexer/s2jh/AbstractIndexingFilter.java | 110 + .../s2jh/S2jhDiscardIndexingFilter.java | 33 + .../indexer/s2jh/S2jhIndexingFilter.java | 45 + .../src/plugin/indexer-elastic/build-ivy.xml | 54 + .../src/plugin/indexer-elastic/build.xml | 22 + .../indexer-elastic/howto_upgrade_es.txt | 6 + .../src/plugin/indexer-elastic/ivy.xml | 35 + .../src/plugin/indexer-elastic/plugin.xml | 57 + .../indexwriter/elastic/ElasticConstants.java | 28 + .../elastic/ElasticIndexWriter.java | 274 + .../indexwriter/elastic/package-info.java | 22 + .../src/plugin/indexer-solr/build.xml | 22 + .../src/plugin/indexer-solr/ivy.xml | 43 + .../src/plugin/indexer-solr/plugin.xml | 54 + .../nutch/indexwriter/solr/SolrConstants.java | 46 + .../indexwriter/solr/SolrIndexWriter.java | 181 + .../indexwriter/solr/SolrMappingReader.java | 147 + .../nutch/indexwriter/solr/SolrUtils.java | 68 + .../nutch/indexwriter/solr/package-info.java | 22 + .../src/plugin/language-identifier/build.xml | 38 + .../src/plugin/language-identifier/ivy.xml | 41 + .../src/plugin/language-identifier/plugin.xml | 49 + .../analysis/lang/HTMLLanguageParser.java | 328 + .../analysis/lang/LanguageIndexingFilter.java | 92 + 
.../analysis/lang/langmappings.properties | 188 + .../apache/nutch/analysis/lang/package.html | 6 + .../analysis/lang/TestHTMLLanguageParser.java | 158 + .../org/apache/nutch/analysis/lang/da.test | 108 + .../org/apache/nutch/analysis/lang/de.test | 104 + .../org/apache/nutch/analysis/lang/el.test | 109 + .../org/apache/nutch/analysis/lang/en.test | 105 + .../org/apache/nutch/analysis/lang/es.test | 107 + .../org/apache/nutch/analysis/lang/fi.test | 106 + .../org/apache/nutch/analysis/lang/fr.test | 105 + .../org/apache/nutch/analysis/lang/it.test | 109 + .../org/apache/nutch/analysis/lang/nl.test | 105 + .../org/apache/nutch/analysis/lang/pt.test | 105 + .../org/apache/nutch/analysis/lang/sv.test | 108 + .../nutch/analysis/lang/test-referencial.txt | 10 + .../src/plugin/lib-htmlunit/build.xml | 25 + .../src/plugin/lib-htmlunit/ivy.xml | 42 + .../src/plugin/lib-htmlunit/plugin.xml | 38 + .../protocol/htmlunit/ExtHtmlunitCache.java | 25 + .../protocol/htmlunit/HttpWebClient.java | 74 + .../htmlunit/RegexHttpWebConnection.java | 158 + .../src/plugin/lib-http/build.xml | 22 + apache-nutch-2.3/src/plugin/lib-http/ivy.xml | 41 + .../src/plugin/lib-http/plugin.xml | 33 + .../protocol/http/api/BlockedException.java | 27 + .../nutch/protocol/http/api/HttpBase.java | 476 ++ .../protocol/http/api/HttpException.java | 40 + .../http/api/HttpRobotRulesParser.java | 146 + .../nutch/protocol/http/api/package.html | 6 + .../http/api/TestRobotRulesParser.java | 126 + .../src/plugin/lib-nekohtml/build.xml | 30 + .../src/plugin/lib-nekohtml/ivy.xml | 42 + .../src/plugin/lib-nekohtml/plugin.xml | 38 + .../src/plugin/lib-pinyin/build.xml | 25 + .../src/plugin/lib-pinyin/ivy.xml | 41 + .../src/plugin/lib-pinyin/plugin.xml | 35 + .../solr/pinyin/ChineseToPinyinConvertor.java | 189 + .../ChineseToPinyinTransformer.java | 77 + .../src/plugin/lib-regex-filter/build.xml | 22 + .../src/plugin/lib-regex-filter/ivy.xml | 41 + .../src/plugin/lib-regex-filter/plugin.xml | 33 + 
.../apache/nutch/urlfilter/api/RegexRule.java | 64 + .../urlfilter/api/RegexURLFilterBase.java | 261 + .../nutch/urlfilter/api/package-info.java | 23 + .../urlfilter/api/RegexURLFilterBaseTest.java | 142 + apache-nutch-2.3/src/plugin/lib-xml/build.xml | 36 + apache-nutch-2.3/src/plugin/lib-xml/ivy.xml | 44 + .../src/plugin/lib-xml/plugin.xml | 65 + .../src/plugin/microformats-reltag/build.xml | 36 + .../src/plugin/microformats-reltag/ivy.xml | 41 + .../src/plugin/microformats-reltag/plugin.xml | 49 + .../sample/microformats_reltag_test.html | 470 ++ .../reltag/RelTagIndexingFilter.java | 101 + .../microformats/reltag/RelTagParser.java | 178 + .../nutch/microformats/reltag/package.html | 8 + .../reltag/TestRelTagIndexingFilter.java | 59 + .../microformats/reltag/TestRelTagParser.java | 99 + .../plugin/nutch-extensionpoints/build.xml | 30 + .../src/plugin/nutch-extensionpoints/ivy.xml | 41 + .../plugin/nutch-extensionpoints/plugin.xml | 62 + .../src/plugin/parse-ext/build.xml | 32 + apache-nutch-2.3/src/plugin/parse-ext/command | 24 + apache-nutch-2.3/src/plugin/parse-ext/ivy.xml | 41 + .../src/plugin/parse-ext/plugin.xml | 60 + .../org/apache/nutch/parse/ext/ExtParser.java | 177 + .../apache/nutch/parse/ext/package-info.java | 21 + .../apache/nutch/parse/ext/TestExtParser.java | 126 + .../src/plugin/parse-html/build.xml | 40 + .../src/plugin/parse-html/ivy.xml | 42 + .../plugin/parse-html/lib/tagsoup.LICENSE.txt | 201 + .../src/plugin/parse-html/plugin.xml | 48 + .../apache/nutch/parse/html/DOMBuilder.java | 766 +++ .../nutch/parse/html/DOMContentUtils.java | 366 ++ .../nutch/parse/html/HTMLMetaProcessor.java | 214 + .../apache/nutch/parse/html/HtmlParser.java | 388 ++ .../parse/html/XMLCharacterRecognizer.java | 112 + .../org/apache/nutch/parse/html/package.html | 5 + .../nutch/parse/html/TestDOMContentUtils.java | 339 + .../nutch/parse/html/TestHtmlParser.java | 138 + .../parse/html/TestRobotsMetaProcessor.java | 155 + .../src/plugin/parse-js/build.xml | 36 + 
apache-nutch-2.3/src/plugin/parse-js/ivy.xml | 41 + .../src/plugin/parse-js/plugin.xml | 53 + .../sample/parse_embedded_js_test.html | 316 + .../apache/nutch/parse/js/JSParseFilter.java | 348 + .../apache/nutch/parse/js/package-info.java | 23 + .../nutch/parse/js/TestJSParseFilter.java | 108 + .../src/plugin/parse-metatags/README.txt | 37 + .../src/plugin/parse-metatags/build.xml | 53 + .../src/plugin/parse-metatags/ivy.xml | 41 + .../src/plugin/parse-metatags/plugin.xml | 22 + .../parse-metatags/sample/testMetatags.html | 11 + .../sample/testMultivalueMetatags.html | 13 + .../nutch/parse/metatags/MetaTagsParser.java | 139 + .../nutch/parse/metatags/package-info.java | 24 + .../parse/metatags/TestMetaTagsParser.java | 218 + .../src/plugin/parse-s2jh/build.xml | 46 + .../src/plugin/parse-s2jh/ivy.xml | 43 + .../src/plugin/parse-s2jh/plugin.xml | 73 + .../parse/s2jh/AbstractHtmlParseFilter.java | 796 +++ .../apache/nutch/parse/s2jh/CrawlData.java | 252 + .../parse/s2jh/HuanqiuHtmlParseFilter.java | 57 + .../parse/s2jh/IndeedHtmlParseFilter.java | 152 + .../parse/s2jh/JtCpdHtmlParseFilter.java | 88 + .../parse/s2jh/JumeiHtmlParseFilter.java | 114 + .../org/apache/nutch/parse/s2jh/Position.java | 45 + .../parse/s2jh/ScjtaqHtmlParseFilter.java | 92 + .../parse/s2jh/TMallHtmlParseFilter.java | 59 + .../nutch/parse/s2jh/VipHtmlParseFilter.java | 61 + .../src/plugin/parse-swf/build.xml | 38 + apache-nutch-2.3/src/plugin/parse-swf/ivy.xml | 41 + .../plugin/parse-swf/lib/javaswf-LICENSE.txt | 33 + .../src/plugin/parse-swf/lib/javaswf.jar | Bin 0 -> 125369 bytes .../src/plugin/parse-swf/plugin.xml | 44 + .../src/plugin/parse-swf/sample/test1.swf | Bin 0 -> 21054 bytes .../src/plugin/parse-swf/sample/test1.txt | 60 + .../src/plugin/parse-swf/sample/test2.swf | Bin 0 -> 42534 bytes .../src/plugin/parse-swf/sample/test2.txt | 5 + .../src/plugin/parse-swf/sample/test3.swf | Bin 0 -> 51562 bytes .../src/plugin/parse-swf/sample/test3.txt | 11 + 
.../org/apache/nutch/parse/swf/SWFParser.java | 708 ++ .../apache/nutch/parse/swf/package-info.java | 21 + .../apache/nutch/parse/swf/TestSWFParser.java | 97 + .../src/plugin/parse-tika/build-ivy.xml | 54 + .../src/plugin/parse-tika/build.xml | 42 + .../plugin/parse-tika/howto_upgrade_tika.txt | 8 + .../src/plugin/parse-tika/ivy.xml | 44 + .../src/plugin/parse-tika/plugin.xml | 88 + .../plugin/parse-tika/sample/encrypted.pdf | Bin 0 -> 3431 bytes .../src/plugin/parse-tika/sample/nutch.html | 519 ++ .../parse-tika/sample/nutch_logo_tm.gif | Bin 0 -> 2747 bytes .../src/plugin/parse-tika/sample/ootest.odt | Bin 0 -> 20753 bytes .../src/plugin/parse-tika/sample/ootest.sxw | Bin 0 -> 20125 bytes .../src/plugin/parse-tika/sample/ootest.txt | 30 + .../src/plugin/parse-tika/sample/pdftest.pdf | Bin 0 -> 2995 bytes .../src/plugin/parse-tika/sample/rsstest.rss | 37 + .../src/plugin/parse-tika/sample/test.rtf | 17 + .../src/plugin/parse-tika/sample/word97.doc | Bin 0 -> 8192 bytes .../apache/nutch/parse/tika/DOMBuilder.java | 766 +++ .../nutch/parse/tika/DOMContentUtils.java | 367 ++ .../nutch/parse/tika/HTMLMetaProcessor.java | 214 + .../apache/nutch/parse/tika/TikaConfig.java | 241 + .../apache/nutch/parse/tika/TikaParser.java | 278 + .../parse/tika/XMLCharacterRecognizer.java | 112 + .../apache/nutch/parse/tika/package-info.java | 23 + .../nutch/parse/tika/DOMContentUtilsTest.java | 387 ++ .../nutch/parse/tika/TestImageMetadata.java | 86 + .../nutch/parse/tika/TestMSWordParser.java | 106 + .../apache/nutch/parse/tika/TestOOParser.java | 114 + .../nutch/parse/tika/TestPdfParser.java | 85 + .../nutch/parse/tika/TestRSSParser.java | 124 + .../nutch/parse/tika/TestRTFParser.java | 92 + .../src/plugin/parse-zip/build.xml | 38 + apache-nutch-2.3/src/plugin/parse-zip/ivy.xml | 41 + .../src/plugin/parse-zip/plugin.xml | 46 + .../src/plugin/parse-zip/sample/test.zip | Bin 0 -> 182 bytes .../org/apache/nutch/parse/zip/ZipParser.java | 112 + .../nutch/parse/zip/ZipTextExtractor.java 
| 122 + .../apache/nutch/parse/zip/package-info.java | 21 + .../apache/nutch/parse/zip/TestZipParser.java | 78 + apache-nutch-2.3/src/plugin/plugin.dtd | 206 + .../src/plugin/protocol-file/build.xml | 30 + .../src/plugin/protocol-file/ivy.xml | 41 + .../src/plugin/protocol-file/plugin.xml | 46 + .../protocol-file/sample/testprotocolfile.txt | 2 + .../sample/testprotocolfile_(encoded).txt | 1 + .../org/apache/nutch/protocol/file/File.java | 234 + .../apache/nutch/protocol/file/FileError.java | 36 + .../nutch/protocol/file/FileException.java | 40 + .../nutch/protocol/file/FileResponse.java | 281 + .../apache/nutch/protocol/file/package.html | 5 + .../nutch/protocol/file/TestProtocolFile.java | 99 + .../src/plugin/protocol-ftp/build.xml | 22 + .../src/plugin/protocol-ftp/ivy.xml | 42 + .../src/plugin/protocol-ftp/plugin.xml | 46 + .../org/apache/nutch/protocol/ftp/Client.java | 586 ++ .../org/apache/nutch/protocol/ftp/Ftp.java | 262 + .../apache/nutch/protocol/ftp/FtpError.java | 36 + .../nutch/protocol/ftp/FtpException.java | 46 + .../ftp/FtpExceptionBadSystResponse.java | 29 + .../FtpExceptionCanNotHaveDataConnection.java | 29 + ...ceptionControlClosedByForcedDataClose.java | 30 + .../FtpExceptionUnknownForcedDataClose.java | 30 + .../nutch/protocol/ftp/FtpResponse.java | 527 ++ .../protocol/ftp/FtpRobotRulesParser.java | 105 + .../protocol/ftp/PrintCommandListener.java | 71 + .../apache/nutch/protocol/ftp/package.html | 5 + .../src/plugin/protocol-http/build.xml | 45 + .../src/plugin/protocol-http/ivy.xml | 41 + .../plugin/protocol-http/jsp/basic-http.jsp | 44 + .../plugin/protocol-http/jsp/brokenpage.jsp | 47 + .../plugin/protocol-http/jsp/redirect301.jsp | 49 + .../plugin/protocol-http/jsp/redirect302.jsp | 49 + .../src/plugin/protocol-http/plugin.xml | 50 + .../org/apache/nutch/protocol/http/Http.java | 75 + .../nutch/protocol/http/HttpResponse.java | 512 ++ .../apache/nutch/protocol/http/package.html | 5 + .../src/test/conf/nutch-site-test.xml | 52 + 
.../nutch/protocol/http/TestProtocolHttp.java | 137 + .../src/plugin/protocol-httpclient/build.xml | 45 + .../src/plugin/protocol-httpclient/ivy.xml | 41 + .../plugin/protocol-httpclient/jsp/basic.jsp | 77 + .../protocol-httpclient/jsp/cookies.jsp | 65 + .../plugin/protocol-httpclient/jsp/digest.jsp | 71 + .../plugin/protocol-httpclient/jsp/noauth.jsp | 38 + .../plugin/protocol-httpclient/jsp/ntlm.jsp | 92 + .../src/plugin/protocol-httpclient/plugin.xml | 57 + .../DummySSLProtocolSocketFactory.java | 163 + .../httpclient/DummyX509TrustManager.java | 86 + .../nutch/protocol/httpclient/Http.java | 449 ++ .../httpclient/HttpAuthentication.java | 45 + .../HttpAuthenticationException.java | 71 + .../httpclient/HttpAuthenticationFactory.java | 142 + .../httpclient/HttpBasicAuthentication.java | 197 + .../protocol/httpclient/HttpResponse.java | 221 + .../nutch/protocol/httpclient/package.html | 9 + .../src/test/conf/httpclient-auth-test.xml | 57 + .../src/test/conf/nutch-site-test.xml | 52 + .../httpclient/TestProtocolHttpClient.java | 231 + .../src/plugin/protocol-s2jh/build.xml | 38 + .../src/plugin/protocol-s2jh/ivy.xml | 41 + .../src/plugin/protocol-s2jh/plugin.xml | 48 + .../org/apache/nutch/protocol/s2jh/Http.java | 91 + .../nutch/protocol/s2jh/HttpResponse.java | 641 ++ .../apache/nutch/protocol/s2jh/package.html | 5 + .../src/plugin/protocol-sftp/build.xml | 21 + .../src/plugin/protocol-sftp/ivy.xml | 42 + .../src/plugin/protocol-sftp/plugin.xml | 46 + .../org/apache/nutch/protocol/sftp/Sftp.java | 318 + .../apache/nutch/protocol/sftp/package.html | 5 + .../src/plugin/scoring-link/build.xml | 27 + .../src/plugin/scoring-link/ivy.xml | 41 + .../src/plugin/scoring-link/plugin.xml | 39 + .../link/LinkAnalysisScoringFilter.java | 94 + .../nutch/scoring/link/package-info.java | 23 + .../src/plugin/scoring-opic/build.xml | 27 + .../src/plugin/scoring-opic/ivy.xml | 41 + .../src/plugin/scoring-opic/plugin.xml | 39 + .../nutch/scoring/opic/OPICScoringFilter.java | 172 + 
.../nutch/scoring/opic/package-info.java | 23 + .../scoring/opic/TestOPICScoringFilter.java | 284 + .../src/plugin/subcollection/README.txt | 10 + .../src/plugin/subcollection/build.xml | 22 + .../src/plugin/subcollection/ivy.xml | 41 + .../src/plugin/subcollection/plugin.xml | 41 + .../nutch/collection/CollectionManager.java | 241 + .../nutch/collection/Subcollection.java | 227 + .../org/apache/nutch/collection/package.html | 36 + .../SubcollectionIndexingFilter.java | 80 + .../indexer/subcollection/package-info.java | 25 + .../nutch/collection/TestSubcollection.java | 112 + apache-nutch-2.3/src/plugin/tld/build.xml | 22 + apache-nutch-2.3/src/plugin/tld/ivy.xml | 41 + apache-nutch-2.3/src/plugin/tld/plugin.xml | 51 + .../nutch/indexer/tld/TLDIndexingFilter.java | 74 + .../org/apache/nutch/indexer/tld/package.html | 5 + .../nutch/scoring/tld/TLDScoringFilter.java | 107 + .../org/apache/nutch/scoring/tld/package.html | 5 + .../indexer/tld/TestTLDIndexingFilter.java | 115 + .../src/plugin/urlfilter-automaton/build.xml | 47 + .../src/plugin/urlfilter-automaton/ivy.xml | 42 + .../src/plugin/urlfilter-automaton/plugin.xml | 43 + .../sample/Benchmarks.rules | 26 + .../sample/Benchmarks.urls | 297 + .../sample/IntranetCrawling.rules | 24 + .../sample/IntranetCrawling.urls | 8 + .../sample/WholeWebCrawling.rules | 19 + .../sample/WholeWebCrawling.urls | 11 + .../automaton/AutomatonURLFilter.java | 107 + .../nutch/urlfilter/automaton/package.html | 9 + .../automaton/TestAutomatonURLFilter.java | 57 + .../src/plugin/urlfilter-domain/build.xml | 28 + .../plugin/urlfilter-domain/data/hosts.txt | 5 + .../src/plugin/urlfilter-domain/ivy.xml | 41 + .../src/plugin/urlfilter-domain/plugin.xml | 43 + .../urlfilter/domain/DomainURLFilter.java | 211 + .../nutch/urlfilter/domain/package-info.java | 25 + .../urlfilter/domain/TestDomainURLFilter.java | 54 + .../src/plugin/urlfilter-prefix/build.xml | 22 + .../src/plugin/urlfilter-prefix/ivy.xml | 41 + 
.../src/plugin/urlfilter-prefix/plugin.xml | 47 + .../urlfilter/prefix/PrefixURLFilter.java | 180 + .../nutch/urlfilter/prefix/package.html | 5 + .../urlfilter/prefix/TestPrefixURLFilter.java | 63 + .../src/plugin/urlfilter-regex/build.xml | 47 + .../src/plugin/urlfilter-regex/ivy.xml | 41 + .../src/plugin/urlfilter-regex/plugin.xml | 48 + .../urlfilter-regex/sample/Benchmarks.rules | 26 + .../urlfilter-regex/sample/Benchmarks.urls | 297 + .../sample/IntranetCrawling.rules | 27 + .../sample/IntranetCrawling.urls | 8 + .../sample/WholeWebCrawling.rules | 22 + .../sample/WholeWebCrawling.urls | 11 + .../nutch/urlfilter/regex/RegexURLFilter.java | 101 + .../apache/nutch/urlfilter/regex/package.html | 5 + .../urlfilter/regex/TestRegexURLFilter.java | 56 + .../src/plugin/urlfilter-suffix/build.xml | 22 + .../src/plugin/urlfilter-suffix/ivy.xml | 41 + .../src/plugin/urlfilter-suffix/plugin.xml | 47 + .../urlfilter/suffix/SuffixURLFilter.java | 330 + .../nutch/urlfilter/suffix/package-info.java | 23 + .../urlfilter/suffix/TestSuffixURLFilter.java | 121 + .../src/plugin/urlfilter-validator/build.xml | 22 + .../src/plugin/urlfilter-validator/ivy.xml | 41 + .../src/plugin/urlfilter-validator/plugin.xml | 41 + .../urlfilter/validator/UrlValidator.java | 398 ++ .../nutch/urlfilter/validator/package.html | 9 + .../urlfilter/validator/TestUrlValidator.java | 130 + .../src/plugin/urlnormalizer-basic/build.xml | 22 + .../src/plugin/urlnormalizer-basic/ivy.xml | 41 + .../src/plugin/urlnormalizer-basic/plugin.xml | 41 + .../basic/BasicURLNormalizer.java | 209 + .../net/urlnormalizer/basic/package-info.java | 23 + .../basic/TestBasicURLNormalizer.java | 105 + .../src/plugin/urlnormalizer-pass/build.xml | 22 + .../src/plugin/urlnormalizer-pass/ivy.xml | 41 + .../src/plugin/urlnormalizer-pass/plugin.xml | 41 + .../urlnormalizer/pass/PassURLNormalizer.java | 49 + .../net/urlnormalizer/pass/package-info.java | 23 + .../pass/TestPassURLNormalizer.java | 46 + 
.../src/plugin/urlnormalizer-regex/build.xml | 34 + .../src/plugin/urlnormalizer-regex/ivy.xml | 41 + .../src/plugin/urlnormalizer-regex/plugin.xml | 41 + .../sample/regex-normalize-default.test | 84 + .../sample/regex-normalize-default.xml | 66 + .../sample/regex-normalize-scope1.test | 8 + .../sample/regex-normalize-scope1.xml | 21 + .../regex/RegexURLNormalizer.java | 324 + .../net/urlnormalizer/regex/package-info.java | 23 + .../regex/TestRegexURLNormalizer.java | 146 + apache-nutch-2.3/src/test/crawl-tests.xml | 52 + .../src/test/domain-urlfilter.txt | 22 + apache-nutch-2.3/src/test/filter-all.txt | 7 + apache-nutch-2.3/src/test/gora.properties | 25 + apache-nutch-2.3/src/test/log4j.properties | 7 + apache-nutch-2.3/src/test/nutch-site.xml | 25 + .../test/org/apache/nutch/api/TestAPI.java | 225 + .../org/apache/nutch/crawl/DummyWritable.java | 32 + .../crawl/TestAdaptiveFetchSchedule.java | 120 + .../org/apache/nutch/crawl/TestGenerator.java | 316 + .../org/apache/nutch/crawl/TestInjector.java | 123 + .../nutch/crawl/TestSignatureFactory.java | 36 + .../nutch/crawl/TestURLPartitioner.java | 241 + .../apache/nutch/crawl/TestUrlWithScore.java | 185 + .../org/apache/nutch/fetcher/TestFetcher.java | 156 + .../nutch/indexer/TestIndexingFilters.java | 107 + .../apache/nutch/metadata/TestMetadata.java | 280 + .../metadata/TestSpellCheckedMetadata.java | 302 + .../org/apache/nutch/net/TestURLFilters.java | 48 + .../apache/nutch/net/TestURLNormalizers.java | 72 + .../nutch/parse/TestOutlinkExtractor.java | 95 + .../apache/nutch/parse/TestParserFactory.java | 104 + .../apache/nutch/parse/parse-plugin-test.xml | 58 + .../nutch/plugin/HelloWorldExtension.java | 36 + .../apache/nutch/plugin/ITestExtension.java | 27 + .../apache/nutch/plugin/SimpleTestPlugin.java | 57 + .../apache/nutch/plugin/TestPluginSystem.java | 301 + .../apache/nutch/protocol/TestContent.java | 95 + .../nutch/protocol/TestProtocolFactory.java | 86 + .../apache/nutch/storage/TestGoraStorage.java | 240 
+ .../apache/nutch/util/AbstractNutchTest.java | 57 + .../org/apache/nutch/util/CrawlTestUtil.java | 154 + .../nutch/util/TestEncodingDetector.java | 103 + .../org/apache/nutch/util/TestGZIPUtils.java | 243 + .../org/apache/nutch/util/TestMimeUtil.java | 127 + .../org/apache/nutch/util/TestNodeWalker.java | 108 + .../nutch/util/TestPrefixStringMatcher.java | 113 + .../org/apache/nutch/util/TestStringUtil.java | 62 + .../nutch/util/TestSuffixStringMatcher.java | 113 + .../org/apache/nutch/util/TestTableUtil.java | 75 + .../org/apache/nutch/util/TestURLUtil.java | 281 + .../apache/nutch/util/WritableTestUtils.java | 56 + .../nutch/webui/client/TestCrawlCycle.java | 123 + .../webui/client/TestNutchClientFactory.java | 72 + .../client/TestRemoteCommandExecutor.java | 105 + .../TestRemoteCommandsBatchFactory.java | 86 + .../nutch/webui/service/NutchServiceTest.java | 66 + .../nutch/webui/view/AbstractWicketTest.java | 32 + .../webui/view/SpringConfigForTests.java | 37 + .../nutch/webui/view/TestColorEnumLabel.java | 60 + .../src/testprocess/gora.properties | 25 + .../fetch-test-site/dup_of_pagea.html | 11 + .../fetch-test-site/exception.html | 13 + .../testresources/fetch-test-site/index.html | 13 + .../fetch-test-site/nested_spider_trap.html | 23 + .../testresources/fetch-test-site/pagea.html | 11 + .../testresources/fetch-test-site/pageb.html | 11 + .../testresources/fetch-test-site/robots.txt | 0 .../testresources/test-mime-util/test.xlsx | Bin 0 -> 3950 bytes .../crawldb/current/part-00000/.data.crc | Bin 0 -> 304 bytes .../crawldb/current/part-00000/.index.crc | Bin 0 -> 12 bytes .../testcrawl/crawldb/current/part-00000/data | Bin 0 -> 37702 bytes .../crawldb/current/part-00000/index | Bin 0 -> 266 bytes .../src/testresources/testcrawl/index/_0.f0 | 1 + .../src/testresources/testcrawl/index/_0.f1 | 1 + .../src/testresources/testcrawl/index/_0.f2 | 1 + .../src/testresources/testcrawl/index/_0.f3 | 1 + .../src/testresources/testcrawl/index/_0.f4 | 1 + 
.../src/testresources/testcrawl/index/_0.f5 | 1 + .../src/testresources/testcrawl/index/_0.fdt | Bin 0 -> 2450 bytes .../src/testresources/testcrawl/index/_0.fdx | Bin 0 -> 152 bytes .../src/testresources/testcrawl/index/_0.fnm | Bin 0 -> 66 bytes .../src/testresources/testcrawl/index/_0.frq | Bin 0 -> 8675 bytes .../src/testresources/testcrawl/index/_0.prx | Bin 0 -> 17355 bytes .../src/testresources/testcrawl/index/_0.tii | Bin 0 -> 504 bytes .../src/testresources/testcrawl/index/_0.tis | Bin 0 -> 34814 bytes .../testresources/testcrawl/index/deletable | Bin 0 -> 4 bytes .../testresources/testcrawl/index/segments | Bin 0 -> 27 bytes .../indexes/part-00000/.index.done.crc | Bin 0 -> 8 bytes .../indexes/part-00000/.segments.crc | Bin 0 -> 12 bytes .../testcrawl/indexes/part-00000/_j.f0 | 1 + .../testcrawl/indexes/part-00000/_j.f1 | 1 + .../testcrawl/indexes/part-00000/_j.f2 | 1 + .../testcrawl/indexes/part-00000/_j.f3 | 1 + .../testcrawl/indexes/part-00000/_j.f4 | 1 + .../testcrawl/indexes/part-00000/_j.f5 | 1 + .../testcrawl/indexes/part-00000/_j.fdt | Bin 0 -> 2450 bytes .../testcrawl/indexes/part-00000/_j.fdx | Bin 0 -> 152 bytes .../testcrawl/indexes/part-00000/_j.fnm | Bin 0 -> 66 bytes .../testcrawl/indexes/part-00000/_j.frq | Bin 0 -> 8675 bytes .../testcrawl/indexes/part-00000/_j.prx | Bin 0 -> 17355 bytes .../testcrawl/indexes/part-00000/_j.tii | Bin 0 -> 504 bytes .../testcrawl/indexes/part-00000/_j.tis | Bin 0 -> 34814 bytes .../testcrawl/indexes/part-00000/commit.lock | 0 .../testcrawl/indexes/part-00000/deletable | Bin 0 -> 4 bytes .../testcrawl/indexes/part-00000/index.done | 0 .../testcrawl/indexes/part-00000/segments | Bin 0 -> 27 bytes .../testcrawl/indexes/part-00000/write.lock | 0 .../linkdb/current/part-00000/.data.crc | Bin 0 -> 408 bytes .../linkdb/current/part-00000/.index.crc | Bin 0 -> 12 bytes .../testcrawl/linkdb/current/part-00000/data | Bin 0 -> 51198 bytes .../testcrawl/linkdb/current/part-00000/index | Bin 0 -> 266 bytes 
.../content/part-00000/.data.crc | Bin 0 -> 52 bytes .../content/part-00000/.index.crc | Bin 0 -> 12 bytes .../20060919213635/content/part-00000/data | Bin 0 -> 5270 bytes .../20060919213635/content/part-00000/index | Bin 0 -> 73 bytes .../crawl_fetch/part-00000/.data.crc | Bin 0 -> 12 bytes .../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes .../crawl_fetch/part-00000/data | Bin 0 -> 139 bytes .../crawl_fetch/part-00000/index | Bin 0 -> 73 bytes .../crawl_generate/.part-00000.crc | Bin 0 -> 12 bytes .../20060919213635/crawl_generate/part-00000 | Bin 0 -> 123 bytes .../crawl_parse/.part-00000.crc | Bin 0 -> 52 bytes .../20060919213635/crawl_parse/part-00000 | Bin 0 -> 5584 bytes .../parse_data/part-00000/.data.crc | Bin 0 -> 24 bytes .../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes .../20060919213635/parse_data/part-00000/data | Bin 0 -> 1973 bytes .../parse_data/part-00000/index | Bin 0 -> 73 bytes .../parse_text/part-00000/.data.crc | Bin 0 -> 24 bytes .../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes .../20060919213635/parse_text/part-00000/data | Bin 0 -> 1717 bytes .../parse_text/part-00000/index | Bin 0 -> 73 bytes .../content/part-00000/.data.crc | Bin 0 -> 540 bytes .../content/part-00000/.index.crc | Bin 0 -> 12 bytes .../20060919213643/content/part-00000/data | Bin 0 -> 68087 bytes .../20060919213643/content/part-00000/index | Bin 0 -> 77 bytes .../crawl_fetch/part-00000/.data.crc | Bin 0 -> 20 bytes .../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes .../crawl_fetch/part-00000/data | Bin 0 -> 1527 bytes .../crawl_fetch/part-00000/index | Bin 0 -> 77 bytes .../crawl_generate/.part-00000.crc | Bin 0 -> 20 bytes .../20060919213643/crawl_generate/part-00000 | Bin 0 -> 1239 bytes .../crawl_parse/.part-00000.crc | Bin 0 -> 480 bytes .../20060919213643/crawl_parse/part-00000 | Bin 0 -> 60205 bytes .../parse_data/part-00000/.data.crc | Bin 0 -> 196 bytes .../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes 
.../20060919213643/parse_data/part-00000/data | Bin 0 -> 23684 bytes .../parse_data/part-00000/index | Bin 0 -> 77 bytes .../parse_text/part-00000/.data.crc | Bin 0 -> 208 bytes .../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes .../20060919213643/parse_text/part-00000/data | Bin 0 -> 25304 bytes .../parse_text/part-00000/index | Bin 0 -> 77 bytes apache-nutch-2.3/urls/seed.txt | 30 + snapshot/eclipse-run.jpg | Bin 0 -> 639274 bytes snapshot/parse-data.jpg | Bin 0 -> 289996 bytes snapshot/solr.png | Bin 0 -> 509462 bytes snapshot/storage-data.jpg | Bin 0 -> 650293 bytes .../WEB-INF/lib/mmseg4j-analysis-1.9.1.jar | Bin 0 -> 9255 bytes .../WEB-INF/lib/mmseg4j-core-1.10.0.jar | Bin 0 -> 702777 bytes .../webapp/WEB-INF/lib/mmseg4j-solr-2.2.0.jar | Bin 0 -> 14952 bytes solr4.10.3/example/solr/README.txt | 63 + .../example/solr/collection1/README.txt | 50 + .../solr/collection1/conf/mongo-connector.log | 0 .../example/solr/collection1/conf/schema.xml | 1178 ++++ .../solr/collection1/conf/solrconfig.xml | 1905 ++++++ .../example/solr/collection1/core.properties | 1 + solr4.10.3/example/solr/solr.xml | 45 + solr4.10.3/example/solr/zoo.cfg | 17 + solr4.10.3/example/start.jar | Bin 0 -> 46294 bytes solr4.10.3/example/webapps/solr.war | Bin 0 -> 30460345 bytes 884 files changed, 100222 insertions(+) create mode 100644 README.md create mode 100644 apache-nutch-2.3/.classpath create mode 100644 apache-nutch-2.3/.gitignore create mode 100644 apache-nutch-2.3/.project create mode 100644 apache-nutch-2.3/.settings/org.apache.ivyde.eclipse.prefs create mode 100644 apache-nutch-2.3/.settings/org.eclipse.core.resources.prefs create mode 100644 apache-nutch-2.3/CHANGES.txt create mode 100644 apache-nutch-2.3/LICENSE.txt create mode 100644 apache-nutch-2.3/NOTICE.txt create mode 100644 apache-nutch-2.3/README.txt create mode 100644 apache-nutch-2.3/build.xml create mode 100644 apache-nutch-2.3/conf/automaton-urlfilter.txt create mode 100644 apache-nutch-2.3/conf/configuration.xsl 
create mode 100644 apache-nutch-2.3/conf/domain-suffixes.xml create mode 100644 apache-nutch-2.3/conf/domain-suffixes.xsd create mode 100644 apache-nutch-2.3/conf/domain-urlfilter.txt create mode 100644 apache-nutch-2.3/conf/elasticsearch.conf create mode 100644 apache-nutch-2.3/conf/gora-accumulo-mapping.xml create mode 100644 apache-nutch-2.3/conf/gora-cassandra-mapping.xml create mode 100644 apache-nutch-2.3/conf/gora-hbase-mapping.xml create mode 100644 apache-nutch-2.3/conf/gora-mongodb-mapping.xml create mode 100644 apache-nutch-2.3/conf/gora-solr-host-schema.xml create mode 100644 apache-nutch-2.3/conf/gora-solr-mapping.xml create mode 100644 apache-nutch-2.3/conf/gora-solr-webpage-schema.xml create mode 100644 apache-nutch-2.3/conf/gora-sql-mapping.xml create mode 100644 apache-nutch-2.3/conf/gora.properties create mode 100644 apache-nutch-2.3/conf/hbase-site.xml create mode 100644 apache-nutch-2.3/conf/htmlunit-urlfilter.txt create mode 100644 apache-nutch-2.3/conf/httpclient-auth.xml create mode 100644 apache-nutch-2.3/conf/log4j.properties create mode 100644 apache-nutch-2.3/conf/nutch-conf.xsl create mode 100644 apache-nutch-2.3/conf/nutch-default.xml create mode 100644 apache-nutch-2.3/conf/nutch-site.xml create mode 100644 apache-nutch-2.3/conf/parse-plugins.dtd create mode 100644 apache-nutch-2.3/conf/parse-plugins.xml create mode 100644 apache-nutch-2.3/conf/prefix-urlfilter.txt create mode 100644 apache-nutch-2.3/conf/regex-normalize.xml create mode 100644 apache-nutch-2.3/conf/regex-urlfilter.txt create mode 100644 apache-nutch-2.3/conf/schema.xml create mode 100644 apache-nutch-2.3/conf/solrindex-mapping.xml create mode 100644 apache-nutch-2.3/conf/subcollections.xml create mode 100644 apache-nutch-2.3/conf/suffix-urlfilter.txt create mode 100644 apache-nutch-2.3/default.properties create mode 100644 apache-nutch-2.3/src/bin/crawl create mode 100644 apache-nutch-2.3/src/bin/nutch create mode 100644 apache-nutch-2.3/src/gora/host.avsc create mode 
100644 apache-nutch-2.3/src/gora/webpage.avsc create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/ConfManager.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/JobManager.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/NutchServer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/JobFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/JobWorker.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/NutchServerPoolExecutor.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/RAMConfManager.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/RAMJobManager.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/db/DbIterator.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/db/DbPageConverter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/db/DbReader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/impl/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/misc/ErrorStatusService.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/request/DbFilter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/request/JobConfig.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/request/NutchConfig.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/request/SeedList.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/request/SeedUrl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/response/DbQueryResult.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/response/ErrorResponse.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/model/response/JobInfo.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/api/model/response/NutchStatus.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/resources/AbstractResource.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/resources/AdminResource.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/resources/ConfigResource.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/resources/DbResource.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/resources/JobResource.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/api/resources/SeedResource.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/CrawlStatus.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/DbUpdateMapper.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/DbUpdateReducer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/DbUpdaterJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/FetchSchedule.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/GeneratorJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/GeneratorMapper.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/GeneratorReducer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/InjectorJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/MD5Signature.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/crawl/NutchWritable.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/Signature.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/SignatureComparator.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/SignatureFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/TextMD5Signature.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/TextProfileSignature.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/URLPartitioner.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/URLWebPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/UrlWithScore.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/WebTableReader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/crawl/package.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/fetcher/FetchEntry.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/fetcher/FetcherJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/fetcher/FetcherReducer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/fetcher/package.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/host/HostDb.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/host/HostDbReader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/host/HostDbUpdateJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/host/HostDbUpdateReducer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/host/HostInjectorJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/host/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/CleaningJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexCleaningFilter.java create 
mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexCleaningFilters.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexWriter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexWriters.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexingException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexingFilter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexingFilters.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/IndexingJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/NutchDocument.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/package.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/solr/SolrConstants.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/indexer/solr/SolrUtils.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/CreativeCommons.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/DublinCore.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/Feed.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/HttpHeaders.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/MetaWrapper.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/Metadata.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/Nutch.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/metadata/package.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLFilter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLFilterChecker.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLFilterException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLFilters.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLNormalizer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLNormalizerChecker.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/URLNormalizers.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/protocols/ProtocolException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/protocols/Response.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/net/protocols/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/HTMLMetaTags.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/Outlink.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/OutlinkExtractor.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/Parse.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseCallable.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseFilter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseFilters.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParsePluginList.java 
create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParsePluginsReader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseStatusCodes.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseStatusUtils.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParseUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/Parser.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParserChecker.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParserFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParserJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/ParserNotFound.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/parse/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/CircularDependencyException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/Extension.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/ExtensionPoint.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/FieldPluggable.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/MissingDependencyException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/Pluggable.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/Plugin.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/PluginClassLoader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/PluginDescriptor.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/PluginManifestParser.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/PluginRepository.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/plugin/PluginRuntimeException.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/plugin/package.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/Content.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/Protocol.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/ProtocolException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/ProtocolFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/ProtocolNotFound.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/ProtocolOutput.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/RobotRules.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/RobotRulesParser.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/protocol/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/scoring/ScoreDatum.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/scoring/ScoringFilter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/scoring/ScoringFilterException.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/scoring/ScoringFilters.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/scoring/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/Host.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/Mark.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/ParseStatus.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/ProtocolStatus.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/StorageUtils.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/storage/WebPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/WebTableCreator.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/storage/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/Benchmark.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/DmozParser.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/ResolveUrls.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/arc/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/DelayHandler.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/FakeHandler.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/LogDebugHandler.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/NotFoundHandler.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/tools/proxy/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/Bytes.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/CommandRunner.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/DeflateUtils.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/DomUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/EncodingDetector.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/util/FSUtils.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/GZIPUtils.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/GenericWritableConfigurable.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/HadoopFSUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/Histogram.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/IdentityPageReducer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/LockUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/MimeUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/NodeWalker.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/NutchConfiguration.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/NutchJob.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/NutchJobConf.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/NutchTool.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/ObjectCache.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/Pair.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/PrefixStringMatcher.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/StringUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/SuffixStringMatcher.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/TableUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/TimingUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/ToolUtil.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/TrieStringMatcher.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/URLUtil.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/util/WebPageWritable.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/domain/DomainStatistics.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/domain/DomainSuffix.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/domain/DomainSuffixes.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/domain/TopLevelDomain.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/domain/package.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/util/package-info.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/NutchUiApplication.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/NutchUiApplication.properties create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/NutchUiServer.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/NutchClient.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/NutchClientFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/webui/client/model/ConnectionStatus.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/model/Crawl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/model/JobConfig.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/model/JobInfo.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/client/model/NutchStatus.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/config/CustomDaoFactory.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/config/CustomTableCreator.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/config/SpringConfiguration.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/model/NutchConfig.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/model/NutchInstance.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/model/SeedList.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/model/SeedUrl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/AbstractBasePage.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/AbstractBasePage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/DashboardPage.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/DashboardPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/LogOutPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/SchedulingPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/SearchPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/StatisticsPage.java create mode 100644 
apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/UrlsUploadPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/UserSettingsPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/assets/nutch-style.css create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/instances/InstancesPage.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/instances/InstancesPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/seed/SeedPage.html create mode 
100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/seed/SeedPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.html create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/CrawlService.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/NutchInstanceService.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/NutchService.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/SeedListService.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java create mode 100644 apache-nutch-2.3/src/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java create mode 100644 apache-nutch-2.3/src/java/overview.html create mode 100644 apache-nutch-2.3/src/plugin/build-plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/build.xml create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/README.txt create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/build.xml create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/conf/crawl-urlfilter.txt create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/conf/nutch-site.xml create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/data/anchor.html create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/data/rdf.html create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/data/rel.html create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/plugin.xml create mode 100644 
apache-nutch-2.3/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html create mode 100644 apache-nutch-2.3/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/feed/build.xml create mode 100644 apache-nutch-2.3/src/plugin/feed/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/feed/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/feed/sample/rsstest.rss create mode 100644 apache-nutch-2.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java create mode 100644 apache-nutch-2.3/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java create mode 100644 apache-nutch-2.3/src/plugin/index-anchor/build.xml create mode 100644 apache-nutch-2.3/src/plugin/index-anchor/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/index-anchor/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html create mode 100644 apache-nutch-2.3/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-basic/build.xml create mode 100644 apache-nutch-2.3/src/plugin/index-basic/ivy.xml create mode 100644 
apache-nutch-2.3/src/plugin/index-basic/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html create mode 100644 apache-nutch-2.3/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-metadata/build.xml create mode 100644 apache-nutch-2.3/src/plugin/index-metadata/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/index-metadata/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java create mode 100644 apache-nutch-2.3/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/index-more/build.xml create mode 100644 apache-nutch-2.3/src/plugin/index-more/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/index-more/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html create mode 100644 apache-nutch-2.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-s2jh/build.xml create mode 100644 apache-nutch-2.3/src/plugin/index-s2jh/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/index-s2jh/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/index-s2jh/src/java/org/apache/nutch/indexer/s2jh/AbstractIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/index-s2jh/src/java/org/apache/nutch/indexer/s2jh/S2jhDiscardIndexingFilter.java create mode 100644 
apache-nutch-2.3/src/plugin/index-s2jh/src/java/org/apache/nutch/indexer/s2jh/S2jhIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/build-ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/build.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/howto_upgrade_es.txt create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/build.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java create mode 100644 apache-nutch-2.3/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/build.xml create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/plugin.xml create mode 100644 
apache-nutch-2.3/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package.html create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/da.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/de.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/el.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/en.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/es.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/fi.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/fr.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/it.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/nl.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/pt.test create mode 100644 apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/sv.test create mode 100644 
apache-nutch-2.3/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/test-referencial.txt create mode 100644 apache-nutch-2.3/src/plugin/lib-htmlunit/build.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-htmlunit/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-htmlunit/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/ExtHtmlunitCache.java create mode 100644 apache-nutch-2.3/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpWebClient.java create mode 100644 apache-nutch-2.3/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/RegexHttpWebConnection.java create mode 100644 apache-nutch-2.3/src/plugin/lib-http/build.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-http/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-http/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java create mode 100644 apache-nutch-2.3/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java create mode 100644 apache-nutch-2.3/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java create mode 100644 apache-nutch-2.3/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java create mode 100644 apache-nutch-2.3/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html create mode 100644 apache-nutch-2.3/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java create mode 100644 apache-nutch-2.3/src/plugin/lib-nekohtml/build.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-nekohtml/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-nekohtml/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-pinyin/build.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-pinyin/ivy.xml create mode 100644 
apache-nutch-2.3/src/plugin/lib-pinyin/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-pinyin/src/java/org/apache/solr/pinyin/ChineseToPinyinConvertor.java create mode 100644 apache-nutch-2.3/src/plugin/lib-pinyin/src/java/org/apache/solr/transformer/ChineseToPinyinTransformer.java create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/build.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java create mode 100644 apache-nutch-2.3/src/plugin/lib-xml/build.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-xml/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/lib-xml/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/build.xml create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/sample/microformats_reltag_test.html create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html create mode 100644 
apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java create mode 100644 apache-nutch-2.3/src/plugin/nutch-extensionpoints/build.xml create mode 100644 apache-nutch-2.3/src/plugin/nutch-extensionpoints/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/nutch-extensionpoints/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/command create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-html/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-html/lib/tagsoup.LICENSE.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-html/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java create mode 100644 
apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java create mode 100644 apache-nutch-2.3/src/plugin/parse-js/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-js/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-js/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-js/sample/parse_embedded_js_test.html create mode 100644 apache-nutch-2.3/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/README.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/sample/testMetatags.html create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/sample/testMultivalueMetatags.html create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/build.xml create mode 100644 
apache-nutch-2.3/src/plugin/parse-s2jh/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/AbstractHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/CrawlData.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/HuanqiuHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/IndeedHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/JtCpdHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/JumeiHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/Position.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/ScjtaqHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/TMallHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-s2jh/src/java/org/apache/nutch/parse/s2jh/VipHtmlParseFilter.java create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/lib/javaswf-LICENSE.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/lib/javaswf.jar create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/sample/test1.swf create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/sample/test1.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/sample/test2.swf create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/sample/test2.txt create mode 100644 
apache-nutch-2.3/src/plugin/parse-swf/sample/test3.swf create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/sample/test3.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/build-ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/howto_upgrade_tika.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/encrypted.pdf create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/nutch.html create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/nutch_logo_tm.gif create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/ootest.odt create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/ootest.sxw create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/ootest.txt create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/pdftest.pdf create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/rsstest.rss create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/test.rtf create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/sample/word97.doc create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java create mode 100644 
apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/build.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/sample/test.zip create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java create mode 100644 
apache-nutch-2.3/src/plugin/plugin.dtd create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/build.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/sample/testprotocolfile.txt create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html create mode 100644 apache-nutch-2.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/build.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java create mode 100644 
apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/build.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/jsp/basic-http.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/jsp/brokenpage.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/jsp/redirect301.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/jsp/redirect302.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java create mode 100644 
apache-nutch-2.3/src/plugin/protocol-httpclient/build.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/jsp/basic.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/jsp/cookies.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/jsp/digest.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/jsp/noauth.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/jsp/ntlm.jsp create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml create mode 100644 
apache-nutch-2.3/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-s2jh/build.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-s2jh/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-s2jh/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-s2jh/src/java/org/apache/nutch/protocol/s2jh/Http.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-s2jh/src/java/org/apache/nutch/protocol/s2jh/HttpResponse.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-s2jh/src/java/org/apache/nutch/protocol/s2jh/package.html create mode 100644 apache-nutch-2.3/src/plugin/protocol-sftp/build.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-sftp/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-sftp/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java create mode 100644 apache-nutch-2.3/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/package.html create mode 100644 apache-nutch-2.3/src/plugin/scoring-link/build.xml create mode 100644 apache-nutch-2.3/src/plugin/scoring-link/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/scoring-link/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java create mode 100644 apache-nutch-2.3/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/scoring-opic/build.xml create mode 100644 apache-nutch-2.3/src/plugin/scoring-opic/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/scoring-opic/plugin.xml create mode 100644 
apache-nutch-2.3/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java create mode 100644 apache-nutch-2.3/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java create mode 100644 apache-nutch-2.3/src/plugin/subcollection/README.txt create mode 100644 apache-nutch-2.3/src/plugin/subcollection/build.xml create mode 100644 apache-nutch-2.3/src/plugin/subcollection/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/subcollection/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java create mode 100644 apache-nutch-2.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java create mode 100644 apache-nutch-2.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html create mode 100644 apache-nutch-2.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java create mode 100644 apache-nutch-2.3/src/plugin/tld/build.xml create mode 100644 apache-nutch-2.3/src/plugin/tld/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/tld/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html create mode 100644 apache-nutch-2.3/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java create mode 100644 apache-nutch-2.3/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html create mode 100644 
apache-nutch-2.3/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/sample/Benchmarks.rules create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/sample/Benchmarks.urls create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/data/hosts.txt create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java create mode 
100644 apache-nutch-2.3/src/plugin/urlfilter-prefix/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-prefix/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-prefix/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/sample/Benchmarks.rules create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/sample/Benchmarks.urls create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-suffix/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-suffix/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-suffix/plugin.xml create mode 100644 
apache-nutch-2.3/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-validator/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-validator/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-validator/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html create mode 100644 apache-nutch-2.3/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-basic/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-basic/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-basic/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-pass/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-pass/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-pass/plugin.xml create mode 100644 
apache-nutch-2.3/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/build.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/ivy.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/plugin.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.xml create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java create mode 100644 apache-nutch-2.3/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java create mode 100644 apache-nutch-2.3/src/test/crawl-tests.xml create mode 100644 apache-nutch-2.3/src/test/domain-urlfilter.txt create mode 100644 apache-nutch-2.3/src/test/filter-all.txt create mode 100644 apache-nutch-2.3/src/test/gora.properties create mode 100644 apache-nutch-2.3/src/test/log4j.properties create mode 100644 apache-nutch-2.3/src/test/nutch-site.xml create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/api/TestAPI.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/DummyWritable.java create 
mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/TestGenerator.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/TestInjector.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/TestSignatureFactory.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/TestURLPartitioner.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/crawl/TestUrlWithScore.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/fetcher/TestFetcher.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/indexer/TestIndexingFilters.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/metadata/TestMetadata.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/net/TestURLFilters.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/net/TestURLNormalizers.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/parse/TestParserFactory.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/parse/parse-plugin-test.xml create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/plugin/HelloWorldExtension.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/plugin/ITestExtension.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/plugin/TestPluginSystem.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/protocol/TestContent.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/protocol/TestProtocolFactory.java create mode 100644 
apache-nutch-2.3/src/test/org/apache/nutch/storage/TestGoraStorage.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/AbstractNutchTest.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/CrawlTestUtil.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestEncodingDetector.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestGZIPUtils.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestMimeUtil.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestNodeWalker.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestStringUtil.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestTableUtil.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/TestURLUtil.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/util/WritableTestUtils.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/client/TestCrawlCycle.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/client/TestNutchClientFactory.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/client/TestRemoteCommandExecutor.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/client/TestRemoteCommandsBatchFactory.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/service/NutchServiceTest.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/view/AbstractWicketTest.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/view/SpringConfigForTests.java create mode 100644 apache-nutch-2.3/src/test/org/apache/nutch/webui/view/TestColorEnumLabel.java create mode 100644 
apache-nutch-2.3/src/testprocess/gora.properties create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/dup_of_pagea.html create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/exception.html create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/index.html create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/nested_spider_trap.html create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/pagea.html create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/pageb.html create mode 100644 apache-nutch-2.3/src/testresources/fetch-test-site/robots.txt create mode 100644 apache-nutch-2.3/src/testresources/test-mime-util/test.xlsx create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/crawldb/current/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/crawldb/current/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/crawldb/current/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/crawldb/current/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.f0 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.f1 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.f2 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.f3 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.f4 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.f5 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.fdt create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.fdx create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.fnm create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.frq create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.prx create mode 100644 
apache-nutch-2.3/src/testresources/testcrawl/index/_0.tii create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/_0.tis create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/deletable create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/index/segments create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/.index.done.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/.segments.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.f0 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.f1 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.f2 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.f3 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.f4 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.f5 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.fdt create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.fdx create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.fnm create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.frq create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.prx create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.tii create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/_j.tis create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/commit.lock create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/deletable create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/index.done create mode 100644 
apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/segments create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/indexes/part-00000/write.lock create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/linkdb/current/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/linkdb/current/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/linkdb/current/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/linkdb/current/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/content/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/content/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/content/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/content/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_fetch/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_fetch/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_fetch/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_fetch/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_generate/.part-00000.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_generate/part-00000 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_parse/.part-00000.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/crawl_parse/part-00000 create mode 100644 
apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_data/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_data/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_data/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_data/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_text/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_text/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_text/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213635/parse_text/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/content/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/content/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/content/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/content/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_fetch/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_fetch/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_fetch/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_fetch/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_generate/.part-00000.crc create mode 100644 
apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_generate/part-00000 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_parse/.part-00000.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/crawl_parse/part-00000 create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_data/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_data/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_data/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_data/part-00000/index create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_text/part-00000/.data.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_text/part-00000/.index.crc create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_text/part-00000/data create mode 100644 apache-nutch-2.3/src/testresources/testcrawl/segments/20060919213643/parse_text/part-00000/index create mode 100644 apache-nutch-2.3/urls/seed.txt create mode 100644 snapshot/eclipse-run.jpg create mode 100644 snapshot/parse-data.jpg create mode 100644 snapshot/solr.png create mode 100644 snapshot/storage-data.jpg create mode 100644 solr4.10.3/example/solr-webapp/webapp/WEB-INF/lib/mmseg4j-analysis-1.9.1.jar create mode 100644 solr4.10.3/example/solr-webapp/webapp/WEB-INF/lib/mmseg4j-core-1.10.0.jar create mode 100644 solr4.10.3/example/solr-webapp/webapp/WEB-INF/lib/mmseg4j-solr-2.2.0.jar create mode 100644 solr4.10.3/example/solr/README.txt create mode 100644 solr4.10.3/example/solr/collection1/README.txt create mode 100644 solr4.10.3/example/solr/collection1/conf/mongo-connector.log create mode 100644 
solr4.10.3/example/solr/collection1/conf/schema.xml create mode 100644 solr4.10.3/example/solr/collection1/conf/solrconfig.xml create mode 100644 solr4.10.3/example/solr/collection1/core.properties create mode 100644 solr4.10.3/example/solr/solr.xml create mode 100644 solr4.10.3/example/solr/zoo.cfg create mode 100644 solr4.10.3/example/start.jar create mode 100644 solr4.10.3/example/webapps/solr.war diff --git a/README.md b/README.md new file mode 100644 index 0000000..225cff9 --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +Nutch AJAX page Fetch, Parse, Index Plugin +============== + +### 项目简介 + +基于Apache Nutch 2.3和Htmlunit, Selenium WebDriver等组件扩展，实现对于AJAX加载类型页面的完整页面内容抓取，以及特定数据项的解析和索引。 + +Because Apache Nutch 2.X ignores all AJAX requests, it cannot capture the dynamic HTML content of fetched pages that load their data via AJAX. + +This plugin uses Htmlunit and Selenium WebDriver to fetch the whole page content, including the content loaded by the necessary dynamic AJAX requests. + +It was developed and tested with Apache Nutch 2.3; you can try it on other Nutch 2.X versions or refactor the source code to suit your own design. 
+ +### Ö÷ÒªÌØÐÔ + +* **³£¹æµÄHTMLÒ³Ãæץȡ**: ¶ÔÓÚ³£¹æµÄÀýÈçÐÂÎÅÀàûÓÐAJAXÌØÐÔµÄÒ³Ãæ¿ÉÒÔÖ±½ÓÓÃNutch×Ô´øµÄprotocol-http²å¼þץȡ¡£ + +* **³£¹æµÄAJAXÒ³Ãæץȡ**: ¶ÔÓÚ¾ø´ó²¿·ÖÖîÈçjQuery ajax¼ÓÔصÄÒ³Ã棬¿ÉÒÔÖ±½ÓÓÃhtmlunitÀ©Õ¹²å¼þץȡ¡£ + +* **ÌØÊâµÄAJAXÇëÇóÒ³Ãæץȡ**: ÖîÈçÌÔ±¦/ÌìèµÄÒ³Ãæ²ÉÓÃÁ˶ÀÌصÄKissy Javascript×é¼þ£¬Ä¿Ç°²âÊÔhtmlunitÎÞ·¨ÕýÈ·½âÎö£¬Òò´ËÍ˶øÇóÆä´Î²ÉÓÃЧÂʵÍһЩµÄSelenium WebDriver·½Ê½ÊµÏÖÒ³ÃæÊý¾Ýץȡ¡£ + +* **»ùÓÚÒ³Ãæ¹ö¶¯µÄAJAXÇëÇóÒ³Ãæץȡ**: ÖîÈçÌÔ±¦/ÌìèµÄÉÌÆ·ÏêÇéÒ³Ãæ»á»ùÓÚÒ³Ãæ¹ö¶¯·¢ÆðÉÌÆ·ÃèÊöÐÅÏ¢µÄ¼ÓÔØ£¬Í¨¹ýHtmlunit»òSelenium WebDriverÀ©Õ¹´¦Àí¿ÉÒÔʵÏÖ´ËÀàÒ³ÃæÊý¾Ýץȡ¡£ + +### ÔËÐз½Ê½ + +Õû¸öÏîÄ¿»ùÓÚ¹Ù·½µÄApache Nutch 2.3Ô´Âë»ù´¡Ö®ÉÏÌí¼Ó²å¼þ´úÂëºÍÅäÖã¬ÔËÐз½Ê½ºÍ¹Ù·½Ö¸Äϱ£³ÖÒ»Ö£¬¾ßÌåÇë²Î¿¼£ºhttp://wiki.apache.org/nutch/ + +ͬʱ¹¤³Ì´úÂëÖÐÌá½»ÁËEclipseµÄ¹¤³ÌÅäÖÃÎļþ£¬¿ÉÒÔÖ±½Óimport EclipseÖÐRun»òDebugÔËÐУ¬Nutch¹¤³ÌÒÔIvy½øÐÐÒÀÀµ¹ÜÀí£¬¿É²ÉÓÃANT Build·½Ê½»ò½¨ÒéÔÚEclipse IDE°²×°Apache Ivy IDE²å¼þ½øÐй¤³Ì±àÒëÔËÐС£ + +![snapshot](http://git.oschina.net/xautlx/nutch-ajax/raw/master/snapshot/eclipse-run.jpg) + +![snapshot](http://git.oschina.net/xautlx/nutch-ajax/raw/master/snapshot/storage-data.jpg) + +![snapshot](http://git.oschina.net/xautlx/nutch-ajax/raw/master/snapshot/parse-data.jpg) + +### À©Õ¹²å¼þ˵Ã÷ + +* **lib-pinyin**: ÓÃÓÚparse»òindex²å¼þת»»ÖÐÎĵ½Æ´ÒôÌá½»solr£»²¿ÊðÓÃÓÚsolr dataimporthandler×é¼þ½øÐÐÆ´Òôת»»µÄtransformerÀ©Õ¹²å¼þ + +* **lib-htmlunit**: »ùÓÚHtmlunitµÄ¶àÏ̴߳¦Àí£¬»º´æ¿ØÖÆ£¬ÇëÇóÕýÔò¿ØÖƵÈÌØÐÔÀ©Õ¹²å¼þ + +* **protocol-s2jh**: »ùÓÚHtmlunitºÍSelenium WebDriverʵÏÖµÄAJAXÒ³ÃæFetcher²å¼þ + +* **parse-s2jh**: »ùÓÚXPath½âÎöÒ³ÃæÔªËØÄÚÈÝ; ³Ö¾Ã»¯½âÎöµ½µÄ½á¹¹»¯Êý¾Ý£¬ÈçMySQL£¬MongoDBµÈ; ¶ÔÓÚ¸ö±ð¸´ÔÓÀàÐÍAJAXÒ³Ã涨ÖÆÅжÏÒ³Ãæ¼ÓÔØÍê³ÉµÄ»Øµ÷ÅжÏÂß¼­ + +* **index-s2jh**: ×·¼ÓÉèÖÃÐèÒª¶îÍâ´«µÝ¸øSOLRË÷ÒýµÄÊôÐÔÊý¾Ý; É趨²»ÐèÒªË÷ÒýµÄÒ³Ãæ¹æÔò; + +### Ïêϸ²Î¿¼Îĵµ + +ÏîÄ¿Ìṩһ·Ý±È½ÏÏêϸµÄ¡°»ùÓÚNutch&Solr¶¨Ïò²É¼¯½âÎöºÍË÷ÒýËÑË÷µÄÕûºÏ¼¼ÊõÖ¸ÄÏÎĵµ¡±£¬¿Éͨ¹ýÒÔÏÂÁ½ÖÖ·½Ê½²é¿´²Î¿¼ÎĵµÄÚÈÝ£º + +* Ö±½Ó»ñÈ¡ÏîÄ¿ÄÚÈݺó£¬ÔÚdocumentĿ¼Ï¸ù¾Ý×Ô¼ºÊìϤµÄ±à¼­Æ÷²é¿´¶ÔÓ¦µÄmd»òhtml¸ñʽÎĵµ£» +* 
GitHubÖ±½Ó½âÎömdÎļþ£¬²¢ÇÒÄÜÕýÈ·´¦ÀíͼƬÁ´½Ó£¬Òò´Ë¿ÉÖ±½ÓÔÚÏß·ÃÎÊ https://github.com/xautlx/nutch-ajax/blob/master/document/Apache_Nutch_Solr_Solution_with_AJAX_support.md + +### Ðí¿É˵Ã÷ + +* Free Open Source + +±¾ÏîÄ¿ËùÓдúÂëÍêÕû¿ªÔ´£¬ÔÚ±£Áô±êʶ±¾ÏîÄ¿À´Ô´ÐÅÏ¢ÒÔ¼°±£Ö¤²»¶Ô±¾ÏîÄ¿½øÐзÇÊÚȨµÄÏúÊÛÐÐΪµÄÇ°ÌáÏ£¬¿ÉÒÔÒÔÈÎÒⷽʽ×ÔÓÉÃâ·ÑʹÓ㺿ªÔ´¡¢·Ç¿ªÔ´¡¢ÉÌÒµ¼°·ÇÉÌÒµ¡£ + +* Charge Support Service + +Èç¹ûÄ㻹ÓÐÐËȤÔÚApache Nutch/Solr/LuceneµÈϵÁм¼ÊõµÄ¶¨ÖƵÄÀ©Õ¹ÊµÏÖ/¼¼Êõ×Éѯ·þÎñ/±ÏÒµÉè¼ÆÖ¸µ¼/¶þ´Î¿ª·¢ÏîÄ¿Ö¸µ¼µÈ·½ÃæµÄºÏ×÷ÒâÏò£¬¿ÉÁªÏµ E-Mail: s2jh-dev@hotmail.com »ò QQ: 2414521719 (¼ÓQÇë×¢Ã÷£ºnutch/solr/lucene) Ǣ̸¡£[ÉÏÊöÁªÏµ·½Ê½Ë¡²»Ö±½ÓÌṩ×ÉѯÀàѯÎÊ£¬ÎªÁËÌáÉýÏîÄ¿»îÔ¾¶È£¬Èô¶ÔÏîÄ¿ÓÐÈκμ¼ÊõÎÊÌâ»òIssue·´À¡£¬ÇëÖ±½ÓÌá½»µ½ÏîÄ¿Õ¾µãÌáÎÊ»òGitƽ̨µÄIssue] + +### Reference + +»¶Ó­¹Ø×¢×÷ÕßÆäËûÏîÄ¿£º + +* [Nutch 2.X AJAX Plugins (Active)](https://github.com/xautlx/nutch-ajax) - »ùÓÚApache Nutch 2.3ºÍHtmlunit, Selenium WebDriverµÈ×é¼þÀ©Õ¹£¬ÊµÏÖ¶ÔÓÚAJAX¼ÓÔØÀàÐÍÒ³ÃæµÄÍêÕûÒ³ÃæÄÚÈÝץȡ£¬ÒÔ¼°Ìض¨Êý¾ÝÏîµÄ½âÎöºÍË÷Òý + +* [S2JH4Net (Active)](https://github.com/xautlx/s2jh4net) - »ùÓÚSpring MVC+Spring+JPA+HibernateµÄÃæÏò»¥ÁªÍø¼°ÆóÒµWebÓ¦Óÿª·¢¿ò¼Ü + +* [S2JH (Deprecated)](https://github.com/xautlx/s2jh) - »ùÓÚStruts2+Spring+JPA+HibernateµÄÃæÏòÆóÒµWebÓ¦Óÿª·¢¿ò¼Ü + +* [Nutch 1.X AJAX Plugins (Deprecated)](https://github.com/xautlx/nutch-htmlunit) - »ùÓÚApache Nutch 1.XºÍHtmlunitµÄÀ©Õ¹ÊµÏÖAJAXÒ³ÃæÅÀ³æץȡ½âÎö²å¼þ + +* [12306 Hunter (Deprecated)](https://github.com/xautlx/12306-hunter) - £¨¹¦ÄÜÒÑʧЧ²»¿ÉÓ㬲»¹ý»¹¿ÉÒÔµ±×÷Swing¿ª·¢ÑùÁвο¼Ö»Óã©Java Swing C/S°æ±¾12306¶©Æ±ÖúÊÖ£¬Óô¦Ä㶮µÄ \ No newline at end of file diff --git a/apache-nutch-2.3/.classpath b/apache-nutch-2.3/.classpath new file mode 100644 index 0000000..d09c620 --- /dev/null +++ b/apache-nutch-2.3/.classpath @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apache-nutch-2.3/.gitignore b/apache-nutch-2.3/.gitignore new file mode 100644 index 0000000..cde7984 
--- /dev/null +++ b/apache-nutch-2.3/.gitignore @@ -0,0 +1,3 @@ +/runtime/ +/build/ +/hadoop.log diff --git a/apache-nutch-2.3/.project b/apache-nutch-2.3/.project new file mode 100644 index 0000000..561bfd2 --- /dev/null +++ b/apache-nutch-2.3/.project @@ -0,0 +1,19 @@ + + + apache-nutch-2.3 + + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + org.apache.ivyde.eclipse.ivynature + + diff --git a/apache-nutch-2.3/.settings/org.apache.ivyde.eclipse.prefs b/apache-nutch-2.3/.settings/org.apache.ivyde.eclipse.prefs new file mode 100644 index 0000000..a95f744 --- /dev/null +++ b/apache-nutch-2.3/.settings/org.apache.ivyde.eclipse.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +org.apache.ivyde.eclipse.standaloneretrieve= diff --git a/apache-nutch-2.3/.settings/org.eclipse.core.resources.prefs b/apache-nutch-2.3/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..99f26c0 --- /dev/null +++ b/apache-nutch-2.3/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding/=UTF-8 diff --git a/apache-nutch-2.3/CHANGES.txt b/apache-nutch-2.3/CHANGES.txt new file mode 100644 index 0000000..622ddbc --- /dev/null +++ b/apache-nutch-2.3/CHANGES.txt @@ -0,0 +1,2090 @@ +Nutch Change Log + +Nutch 2.3 Release 08012015 (ddmmyyyy) +Release Report - http://s.apache.org/nutch_2.3 + +* NUTCH-1779 Apply formatting to the code (lewismc) + +* NUTCH-1907 Incorrect output of Outlinks to Hosts within HostDbUpdateReducer (lewismc) + +* NUTCH-1856 Document webpage.avsc and host.avsc (lewismc) + +* NUTCH-1834 GeneratorMapper behavior depends on log level (Gerhard Gossen via snagel) + +* NUTCH-1899 upgrade restlet lib to prevent build failure (talat) + +* NUTCH-1797 remove unused package o.a.n.html (Saurabh Chhajed via snagel) + +* NUTCH-1888 Specify HTMLMapper to use in TikaParser (Halil Simsek via jnioche) + +* NUTCH-1897 Easier debugging of plugin XML errors (markus) + +* NUTCH-1823 
Upgrade to elasticsearch 1.4.1 (Phu Kieu, markus, lewismc) + +* NUTCH-1829 Generator : unable to distinguish real errors (Mathieu Bouchard, jnioche, snagel) + +* NUTCH-1778 Generator not logging number of URLs in batch correctly (jnioche via snagel) + +* NUTCH-1877 Suffix URL filter to ignore query string by default (markus via snagel) + +* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel) + +* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel) + +* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel) + +* NUTCH-1880 URLUtil should not add additional slashes for file URLs (snagel) + +* NUTCH-1879 Regex URL normalizer should remove multiple slashes after file: protocol (snagel) + +* NUTCH-1820 remove field "orig" which duplicates "id" (lewismc, snagel) + +* NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib) + +* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel) + +* NUTCH-1882 ant eclipse target to add output path to src/test (snagel) + +* NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel) + +* NUTCH-1876 Upgrade to Crawler Commons 0.5 (jnioche) + +* NUTCH-1866 ant eclipse target should not delete runtime (nimafl via lewismc) + +* NUTCH-1859 Make Nutch webapp port configurable (Nima Falaki via lewismc) + +* NUTCH-1848 Bug in DashboardPage.html instances counter (Nima Falaki via lewismc) + +* NUTCH-841 Create a Wicket-based Web Application for Nutch (Fjodor Vershinin via lewismc) + +* NUTCH-1832 Make Nutch work without an indexer (mattmann via lewismc) + +* NUTCH-1840 the describe function in SolrIndexWriter is not correct (kaveh minooie via jnioche) + +* NUTCH-1837 Upgrade to Tika 1.6 (lewismc) + +* NUTCH-1829 Generator : unable to distinguish real errors (Mathieu Bouchard via jnioche) + +* NUTCH-1828 bin/crawl : incorrect handling of nutch errors (Mathieu Bouchard via jnioche) + +* 
NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel) + +* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) + +* NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc) + +* NUTCH-1708 use same id when indexing and deleting redirects (snagel) + +* NUTCH-1817 Remove pom.xml from source (jnioche) + +* NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel) + +* NUTCH-1776 Log incorrect plugin.folder file path (Diaa via snagel) + +* NUTCH-1566 bin/nutch to allow whitespace in paths (tejasp, snagel) + +* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) + +* NUTCH-385 Improve description of thread related configuration for Fetcher (jnioche,lufeng) + +* NUTCH-1798 Crawl script not calling index command correctly (Aaron Bedward via jnioche) + +* NUTCH-1769 REST API refactoring (Fjodor Vershinin via lewismc) + +* NUTCH-1633 slf4j is provided by hadoop and should not be included in the job file (kaveh minooie via jnioche) + +* NUTCH-1787 update and complete API doc overview page (snagel) + +* NUTCH-1767 remove special treatment of "params" in relative links (snagel) + +* NUTCH-1718 redefine http.robots.agent as "additional agent names" (snagel, Tejas Patil, Daniel Kugel) + +* NUTCH-1796 Ensure Gora object builders are used as oppose to empty constructors (snagel via lewismc) + +* NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc (jnioche) + +* NUTCH-1736 Can't fetch page if http response header contains Transfer-Encoding:chunked (ysc via jnioche) + +* NUTCH-1782 NodeWalker to return current node (markus) + +* NUTCH-1781 Update gora-*-mapping.xml and gora.proeprties to reflect Gora 0.4 (lewismc) + +* NUTCH-1768 Upgrade to ElasticSearch 1.1.0 (jnioche) + +* NUTCH-1634 readdb -stats shows the result twice (kaveh minooie via jnioche) + +* NUTCH-1780 ttl and gc_grace_seconds attributes are missing 
from gora-cassandra-mapping.xml file (kaveh minooie via lewismc) + +* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus) + +* NUTCH-1674 Use batchId filter to enable scan (GORA-119) for Fetch,Parse,Update,Index (Tien Nguyen Manh and Alparslan Avcı via jnioche) + +* NUTCH-1714 Upgrade to Gora 0.4 (Alparslan Avcı via jnioche) + +* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel) + +* NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche) + +* NUTCH-1182 fetcher to log hung threads (snagel) + +* NUTCH-1618 Turn speculative execution off for Fetching (talat) + +* NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set in HTMLParser (talat) + +* NUTCH-1725 CleaningJob's reducer does not commit deleted docs. (ilhamikalkan via talat) + +* NUTCH-1728 indexer-solr plugin is not delete docs from Solr (ilhamikalkan via talat) + +* NUTCH-1753 Eclipse dependecy problem for 2.x (talat) + +* NUTCH-1720 Duplicate lines in HttpBase.java (Walter Tietze via jnioche) + +* NUTCH-797 URL not properly constructed when link target begins with a "?" 
(Doug Cook, Robert Hohman, Stondet, ab via snagel) + +* NUTCH-1759 Upgrade to Crawler Commons 0.4 (jnioche) + +* NUTCH-1700 Remove deprecated code in src/plugin/creativecommons/build.xml (lewismc) + +* NUTCH-1761 Crawl script fails to find job file if not started from inside bin dir (David Hosking, jnioche) + +* NUTCH-1603 ZIP parser complains about truncated PDF file (snagel via lewismc) + +* NUTCH-1743 parsechecker to show outlinks (snagel) + +* NUTCH-1732 Better cmd line parsing for NutchServer (Fjodor Vershinin via lewismc) + +* NUTCH-1751 Empty anchors should not index (Sertac TURKEL via lewismc) + +* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel) + +* NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc) + +* NUTCH-1738 Expose number of URLs generated per batch in GeneratorJob (Talat UYARER via lewismc) + +* NUTCH-1671 indexchecker to add digest field (snagel, lufeng) + +* NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel) + +* NUTCH-1478 Parse-metatags and index-metadata plugin for Nutch 2.x series (kiran, Nguyen Manh Tien, Talat UYARER, Vangelis Karvounis via lewismc) + +* NUTCH-1729 Upgrade to Tika 1.5 (jnioche) + +* NUTCH-1721 Upgrade to Crawler commons 0.3 (tejasp) + +* NUTCH-1719 DomainStatistics fails in 2.x because URL is not unreversed (Gerhard Gossen via lewismc) + +* NUTCH-1253 Incompatable neko and xerces versions (snagel, lewismc, Talat UYARER) + +* NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp) + +* NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, Doğacan Güney via markus) + +* NUTCH-1164 Write JUnit tests for protocol-http (Sertac TURKEL via tejasp) + +* NUTCH-1710 Add gora package logging to log4j.properties (lewismc) + +* NUTCH-1655 Indexer Plugin for Elastic Search (Talat UYARER via lewismc) + +* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc) + +* 
NUTCH-1568 port pluggable indexing architecture to 2.x (Talat UYARER via lewismc) + +* NUTCH-1672 Inlinks are added twice in DbUpdateReducer (Tien Nguyen Manh via lewismc) + +* NUTCH-1667 Updatedb always ignore batchId (Tien Nguyen Manh via lewismc) + +* NUTCH-1695 NutchDocument.toString() (markus via lewismc) + +* NUTCH-1696 Enable use of (Gora) SNAPSHOT dependencies (lewismc) + +* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel, markus via lewismc) + +* NUTCH-1673 Title isn't reset in MoreIndexingFilter (Nguyen Manh Tien via lewismc) + +* NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche) + +* NUTCH-1651 modifiedTime and prevmodifiedTime never set (Talat UYARER via lewismc) + +* NUTCH-1360 Suport the storing of IP address connected to when web crawling (ferdy, lewismc, Yasin Kılınç) + +* NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb and is generated over and over again to 2.x (Talat UYARER via lewismc) + +* NUTCH-1650 Adaptive Fetch Scheduler interval Wrong Set (Talat UYARER via lewismc) + +* NUTCH-1413 Record response time (Yasin KILINC, Talat UYARER, snagel via lewismc) + +* NUTCH-1125 JUnit test for tld (Sertac TURKEL via lewismc) + +* NUTCH-1124 JUnit test for scoring-opic (Talat UYARER via lewismc) + +* NUTCH-1641 Log timings for main jobs (jnioche) + +* NUTCH-1556 enabling updatedb to accept batchId (kaveh minooie,Feng) + +* NUTCH-1619 Writes Dmoz Description and Title information to db with snippet argument ( Yasin Kılınç via feng) + +* NUTCH-1631 Display Document Count Added To Solr Server (Furkan KAMACI via lewismc) + +* NUTCH-1629 Injector skips empty lines in seed files (kaveh minooie via jnioche) + +* NUTCH-1624 Typo in WebTableReader line 486 (kaveh minooie via lewismc) + +* NUTCH-1294 IndexClean job with solr implementation. 
(Dan Rosher, lewismc, Claudiu Chis via feng) + +* NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via snagel) + +* NUTCH-1587 misspelled property "threshold" in conf/log4j.properties (snagel) + +* NUTCH-1604 ProtocolFactory not thread-safe (jnioche) + +* NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) + +* NUTCH-1594 count variable is never changed in ParseUtil class (Canan via Feng) + +Release 2.2.1 - 06/27/2013 (mm/dd/yyyy) +Release Report - http://s.apache.org/PGa + +* NUTCH-1591 Incorrect conversion of ByteBuffer to String (Jason Howes via lewismc) + +* NUTCH-1571 SolrInputSplit doesn't implement Writable and crawl script doesn't pass crawlId to generate and updatedb tasks (yuanyun.cn via lewismc) + +* NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus) + +* NUTCH-1585 Ensure duplicate tags do not exist in microformat-reltag tag set (lewismc) + +* NUTCH-1475 Index-More Plugin -- A better fall back value for date field (James Sullivan, snagel via lewismc) + +* NUTCH-1420 Get rid of the dreaded � (markus + lewismc) + +* NUTCH-1578 Upgrade to Hadoop 1.2.0 (markus) + +* NUTCH-1522 Upgrade to Tika 1.3 (jnioche) + +Release 2.2 - 05/31/2013 (mm/dd/yyyy) +Jira Release Report - http://s.apache.org/LPB + +* NUTCH-1576 Need to keep hotStore.flush() exception catching (James Sullivan via lewismc) + +* NUTCH-1577 Add target for creating eclipse project (tejasp via lewismc) + +* NUTCH-1545 capture batchId and remove references to segments in 2.x crawl script. 
(Feng) + +* NUTCH-1575 support solr authentication in nutch 2.x (Feng) + +* NUTCH-1569 Upgrade 2.x to Gora 0.3 (lewismc) + +* NUTCH-1243 Junit jar removed from lib (lewismc) + +* NUTCH-1249 and NUTCH-1275 : Resolve all issues flagged up by adding javac -Xlint argument (tejasp) + +* NUTCH-1513 Support Robots.txt for Ftp urls (tejasp) + +* NUTCH-1053 Parsing of RSS feeds fails (tejasp) + +* NUTCH-1563 FetchSchedule#getFields is never used by GeneratorJob (Feng) + +* NUTCH-1573 Upgrade to most recent JUnit 4.x to improve test flexibility (lewismc) + +* Added crawler-commons dependency in pom.xml (tejasp) + +* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel) + +* NUTCH-1277 Fix [fallthrough] javac warnings (tejasp) + +* NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp) + +* NUTCH-1273 Fix [deprecation] javac warnings (lewsimc + tejasp) + +* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp) + +* NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp) + +* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc) + +* NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc) + +* NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via lewismc) + +* NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng) + +* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel) + +* NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel via lewismc) + +* NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc) + +* NUTCH-1532 Replace 'segment' mapping field with batchId (patches v2 + v3) (Feng +via lewismc) + +* NUTCH-1533 Implement getPrevModifiedTime(), setPrevModifiedTime(), getBatchId() and setBatchId() accessors in o.a.n.storage.WebPage (Feng via lewismc) + +* NUTCH-XX fix Elastic Search Ivy configuration (Binoy d via lewismc) + +* NUTCH-1542 
"adddays" param for generator not present in 2.x (tejasp) + +* NUTCH-1393 Display consistent usage of GeneratorJob with 1.X (Lufeng +via lewismc) + +* NUTCH-1540 Add Gora buffered read and write maximum limits to nutch-default.xml configuration. (lewismc) + +* NUTCH-842 AutoGenerate WebPage code (jnioche via lewismc) + +* NUTCH-1536 Ant build file has hardcoded conf dir location (zm via lewismc) + +* NUTCH-XX remove unused db.max.inlinks property in nutch-default.xml (lewismc) + +* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (tejasp) + +* NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc) + +* NUTCH-1274 Fix [cast] javac warnings (tejasp via lewismc) + +* NUTCH-1516 Nutch 2.x pom.xml out of sync with ivy.xml (lewismc) + +* NUTCH-1510 Upgrade to Hadoop 1.1.1 (markus) + +* NUTCH-1503 Configuration properties not in sync between FetcherReducer and nutch-default.xml (snagel + lewismc) + +* NUTCH-1394 backport NUTCH-1232 Remove site field from index-basic (lewismc) + +* NUTCH-1370 Expose exact number of urls injected @runtime (ferdy, snagel and lewismc) + (includes commit for NUTCH-1471 make explicit which datastore urls are injected to) + +* NUTCH-1484 TableUtil unreverseURL fails on file:// URLs (Rogério Pereira Araújo via snagel) + +* NUTCH-1451 Upgrade automaton jar to 1.11-8 (lewismc) + +* NUTCH-1496 ParserJob logs skipped urls with level info (Nathan Gass via lewismc) + +* NUTCH-1488 bin/nutch to run junit from any directory (snagel via lewismc) + +* NUTCH-1493 Error adding field 'contentLength'='' during solrindex using index-more (Nathan Gass via lewismc) + +* NUTCH-1491 Strip UTF-8 non-character codepoints in title (Nathan Gass via markus) + +* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns (snagel) + +* NUTCH-1433 Upgrade to Tika 1.2 (jnioche) + +* NUTCH-1087 Deprecate crawl command and replace with example script (jnioche) + +* NUTCH-874 Make sure all plugins in src/plugin are compatible 
with Nutch 2.0 and Gora (part 1) (Kiran Chitturi via lewismc) + +* NUTCH-1344 BasicURLNormalizer to normalize https same as http (snagel) + +* NUTCH-706 Url regex normalizer: pattern for session id removal not to match "newsId" (Meghna Kukreja via snagel) + +Release 2.1 (19/09/2012) ddmmyyyy +Full Jira Report - https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=10680&version=12321040 + +* NUTCH-1415 release packages to contain top level folder apache-nutch-x.x (snagel) + +* NUTCH-1432 property storage.schema does not work anymore, should be storage.schema.webpage and storage.schema.host (lewismc) + +* NUTCH-1468 Redirects that are external links not adhering to db.ignore.external.links (Matt MacDonald via ferdy) + +* NUTCH-1470 Ensure test files are included for runtime testing (lewismc) + +* NUTCH-1162 Write JUnit tests for parse-js (lewismc) + +* NUTCH-1161 Write JUnit tests for microformats-reltag plugin (lewismc) + +* NUTCH-1160 Write JUnit tests for index-basic (lewismc) + +* NUTCH-1456 Updater not setting batchId in markers correctly. 
(Alexander Kingson via ferdy) + +* NUTCH-1459 Remove dead code (phase2) from InjectorJob (ferdy) + +* NUTCH-1431 Introduce link 'distance' and add configurable max distance in the generator (ferdy) + +* NUTCH-1448 Redirected urls should be handled more cleanly (more like an outlink url) (ferdy) + +* NUTCH-1463 Elasticsearch indexer should wait and check response for last flush (ferdy) + +* NUTCH-1462 Elasticsearch not indexing when type==null in NutchDocument metadata (ferdy) + +* NUTCH-1395 Show batchId when skipping within ParserJob (lewismc) + +* NUTCH-1365 Fix crawlId functionalilty by making using of new gora configuration (ferdy) + +* NUTCH-1442 indexingfilter.order is property is misread in code (ferdy via lewismc) + +* NUTCH-1450 Upgrade to gora deps to 0.2.1 except gora-cassandra (lewismc) + +* NUTCH-1159 Write JUnit test for index-anchor (ferdy + lewismc) + +* NUTCH-1445 Add ElasticIndexerJob that indexes to elasticsearch (ferdy) + +* NUTCH-1444 Indexing should not create temporary files (do not extend from FileOutputFormat) (ferdy) + +* NUTCH-1443 Solr schema version is invalid (markus) + +* NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy) + +* NUTCH-1417 Remove o.a.n.metadata.Office (lewismc) + +* NUTCH-1376 add ant description parameters (lewismc) + +* NUTCH-1440 reconfigure non-existent stopwords_en.txt in schema-solr4.xml (shekhar sharma via lewismc) + +* NUTCH-1439 Define boost field as type float in schema-solr4.xml (shekhar sharma via lewismc) + +* NUTCH-1438 ParserJob support for option -reparse (ferdy) + +* NUTCH-1437 HostInjectorJob to accept lines with or without protocol (ferdy) + +* NUTCH-1435 Host jobs throw NullPointerException with MySQL (ferdy via lewismc) + +* NUTCH-1428 GeneratorMapper should not initialize filters/normalizers when they are disabled (ferdy) + +* NUTCH-1427 Reuse SelectorEntry in Generator. 
(ferdy) + +* NUTCH-1411 nutchgora fetcher.store.content does not work (Alexander Kingson via ferdy) + +* NUTCH-1426 HostDb close() should close store instead of flush (ferdy) + +* NUTCH-1425 DbUpdaterJob declares PREV_SIGNATURE on input twice (ferdy) + +* NUTCH-1424 fix fetcher timelimit logging (ferdy) + +* NUTCH-1423 Remove unused fields in LanguageIndexingFilter (ferdy) + +* NUTCH-1306 Add option to not commit and clarify existing solr.commit.size (ferdy) + +Release 2.0 (08/06/2012) ddmmyyy +Full Jira report - https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=10680&version=12314893 + +* NUTCH-1391 readdb -stats fires java.io.EOFException (jnioche) + +* NUTCH-1400 Remove developer -core option for bin/nutch (jnioche) + +* NUTCH-1399 TestProtocolHttpClient fails (jnioche) + +* NUTCH-1404 Nutch script fails to find job file in deploy mode (sidabatra, jnioche) + +* NUTCH-1401 Upgrade to Hadoop 1.0.3 (jnioche) + +* NUTCH-1396 Upgrade Tika 1.1 (jnioche) + +* NUTCH-1392 -force and -resume arguments being ignored in ParserJob (ferdy via lewismc) + +* NUTCH-1379 NPE when reprUrl is null in ParseUtil (ferdy) + +* NUTCH-1378 HostDb NullPointerException (ferdy) + +* NUTCH-XX Commit to add configuration for separation of ant distribution targets (lewismc + jnioche) + +* NUTCH-1364 Add a counter for malformed urls (Jason Trost via lewismc) + +* NUTCH-1361 Fix mishandling of malformed urls in generator job (Jason Trost via lewismc) + +* NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy) + +* NUTCH-1362 Fix error handling of urls with empty fields (lewis, ferdy) + +* NUTCH-1026 Strip UTF-8 non-character codepoints (markus, ferdy) + +* NUTCH-1358 Do not accept bogus arguments (ferdy) + +* NUTCH-1349 Make batchId explcit within debug logging and improve CLI (lewismc + ferdy) + +* NUTCH-1352 Improve regex urlfilters/normalizers synchronization (ferdy) + +* NUTCH-1356 ParseUtil use ExecutorService instead of manually thread handling. 
(ferdy) + +* NUTCH-1355 nutchgora Configure minimum throughput for fetcher (ferdy) + +* NUTCH-1354 nutchgora support fetcher.queue.depth.multiplier property (ferdy) + +* NUTCH-1353 nutchgora DomainStatistics support crawlId, counter bug and reformatting (ferdy) + +* NUTCH-1350 remove unused dependancy because of access restriction (ferdy) + +* NUTCH-1205 Upgrade gora modules to 0.2 in ivy/ivy.xml (lewismc, ferdy) + +* NUTCH-882 Design a Host table in GORA (jnioche, ab, dogacan, Mathijs Homminga, ferdy) + +* NUTCH-1340 Increase scalability by only removing markers when they actually exist for DbUpdaterReducer (ferdy) + +* NUTCH-1333 Introduce AvroStore, DataFileAvroStore and Accumulo Datastore implementations (lewismc) + +* NUTCH-1312 Nutchgora to send HTTP-accept header (ferdy) + +* NUTCH-1311 Add response headers to datastore for the protocol-httpclient plugin (Dan Rosher via ferdy) + +* NUTCH-1304 GeneratorMapper.java dosen't return when skipping and already generated mark (Dan Rosher via lewismc) + +* NUTCH-1307 Improve formatting of ant targets for clearer project help (lewismc) + +* NUTCH-1302 nutchgora job failures should be noticed by submitter (ferdy) + +* NUTCH-1298 Pass numTasks to FetcherJob (Dan Rosher via ferdy) + +* NUTCH-1289 In distributed mode URL's are not partitioned (Dan Rosher, ferdy) + +* NUTCH-1292 Better exception logging and debugging during fetch. 
(ferdy) + +* NUTCH-1263 FetcherJob must put 'fetchTime' on input (ferdy) + +* NUTCH-1296 nutchgora fetcher does not show correct 'threads' and 'resuming' properties (ferdy) + +* NUTCH-1295 nutchgora restlet dependencies failing when remote repos is down (ferdy) + +* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy) + +* NUTCH-1287 Upgrade to hsqldb 2.2.8 (ferdy) + +* NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy) + +* NUTCH-1246 Upgrade to Hadoop 1.0.0 (lewismc) + +* NUTCH-1279 Check if limit has been reached in GeneraterReducer must be the first check performance-wise. (ferdy) + +* NUTCH-1255 Change ivy.xml of all plugins to remove "nutch.root" property (ferdy) + +* NUTCH-1189 add commented out default settings to gora.properties file (lewismc, Ferdy) + +* NUTCH-1138 remove LogUtil from trunk and nutchgora (lewismc) + +* NUTCH-1237 Improve javac arguements for more verbose output (lewismc) + +* NUTCH-1217 Update NOTICE.txt to drop some copyrights (lewismc) + +* NUTCH-1216 Add trivial comment to lib/native/README.txt (lewismc) + +* NUTCH-1198 Less verbose logging when unmapped mimetypes are trying to be parsed. 
(ferdy) + +* NUTCH-1196 Update job should impose an upper limit on the number of inlinks (nutchgora) (ferdy) + +* NUTCH-1185 Decrease solr.commit.size to 250 (markus) + +* NUTCH-1172 AbstractNuchTest should have a generic testdir instead of specific 'inject' dir (ferdy) + +* NUTCH-1192 Add '/runtime' to svn ignore (ferdy) + +* NUTCH-1191 Port NUTCH-1102 to nutchgora - consistent use of fetcher.parse (ferdy) + +* NUTCH-1187 Port NUTCH-1028 to nutchgora - log parser keys (ferdy) + +* NUTCH-902 Add all necessary files and configuration so that nutch can be used with different backends out-of-the-box (lewismc) + +* NUTCH-1081 & 1135 ant tests fail & Fix TestGoraStorage for Nutchgora (Ferdy via lewismc) + +* NUTCH-1156 building errors with gora-hbase as a backend; update ivy.xml to use correct dependancies (Ferdy via lewismc) + +* NUTCH-1109 Add Sonar targets to Ant build.xml (lewismc) + +* NUTCH-1097 application/xhtml+xml should be enabled in plugin.xml of parse-html; allow multiple mimetypes for plugin.xml (Ferdy via lewismc) + +* Change plugin source directory "languageidentifier" to "language-identifier" (lewismc) + +* NUTCH-1132, 1133 & 1134 Fix TestGenerator, TestInjector & TestFetcher respectively (lewismc) + +* NUTCH-1154 Upgrade to Tika 0.10. NOTE: Tika's new RTF parser may ignore more + text in malformed documents than previously - see TIKA-748 for details. 
(ab) + +* NUTCH-1152 Upgrade SolrJ to version 3.4.0 (ab) + +* NUTCH-1136 Ant pmd target is broken + +* NUTCH-1058 Upgrade Solr schema version to 1.4 (markus) + +* NUTCH-672 allow unit tests to be run from bin/nutch (Todd Lipton via lewismc) + +* NUTCH-937 Put plugins in classes/plugins in job file (Claudio Martella, Ferdy Galema, jnioche) + +* NUTCH-1131 Rely on published artefacts for GORA (jnioche) + +* NUTCH-1099 Adds HBase and Cassandra storage properties to nutch-default.xml (lewismc) + +* NUTCH-1096 Empty (not null) ContentLength results in failure of fetch (Ferdy Galema via jnioche) + +* NUTCH-1089 Short compressed pages caused exception in protocol-httpclient (Simone Frenzel via jnioche) + +* NUTCH-1085 Nutch script does not require HADOOP_HOME (jnioche) + +* NUTCH-1083 ParserChecker implements Tools (jnioche) + +* NUTCH-1004 Do not index empty values for title field (markus) + +* NUTCH-914 Implement Apache Project Branding Requirements (lewismc via jnioche) + +* NUTCH-1065 New mvn.template (lewismc) + +* NUTCH-1045 MimeUtil to rely on default config provided by Tika (jnioche) + +* NUTCH-1037 Option to deduplicate anchors prior to indexing (markus) + +* NUTCH-1055 upgrade package.html file in language identifier plugin (lewismc) + +* NUTCH-1043 Add pattern for filtering .js in default url filters (jnioche) + +* NUTCH-1027 Degrade log level of `can't find rules for scope` (markus) + +* NUTCH-1011 Normalize duplicate slashes in URL's (markus) + +* NUTCH-1013 Migrate RegexURLNormalizer from Apache ORO to java.util.regex (markus) + +* NUTCH-1016 Strip UTF-8 non-character codepoints and add logging for SolrWriter (markus) + +* NUTCH-1012 Cannot handle illegal charset $charset (markus) + +* NUTCH-295 Description for fetcher.threads.fetch property (kubes via markus) + +* NUTCH-1006 MetaEquiv with single quotes not accepted (markus) + +* NUTCH-1010 ContentLength not trimmed (markus) + +* NUTCH-995 Generate POM file using the Ivy makepom task (mattmann, jnioche, 
Gabriele Kahlout) + +* NUTCH-1003 task 'package' does not reflect the new organisation of the code (jnioche) + +* NUTCH-994 Fine tune Solr schema (markus) + +* NUTCH-999 Normalise String representation for Dates in IndexingFilters (jnioche) + +* NUTCH-996 Indexer adds solr.commit.size+1 docs (markus) + +* NUTCH-983 Upgrade SolrJ to 3.1 (markus, jnioche) + +* NUTCH-989 Index-basic plugin and Solr schema now use date fieldType for tstamp field (markus) + +* NUTCH-888 Remove parse-rss and add tests for rss to parse-tika (jnioche) + +* NUTCH-991 SolrDedup must issue a commit (markus) + +* NUTCH 986 SolrDedup fails due to date incorrect format (markus) + +* NUTCH-977 SolrMappingReader uses hardcoded configuration parameter name for mapping file (markus) + +* NUTCH-976 Rename properties solrindex.* to solr.* (markus) + +* NUTCH-975 Fix missing/wrong headers in source files (markus, jnioche) + +* NUTCH-980 Fix IllegalAccessError with slf4j used in Solrj (markus) + +* NUTCH-982 Remove copying of ID and URL field in solrmapping (markus) + +* NUTCH-891 Subcollection plugin won't require blacklist any more (markus) + +* NUTCH-967 Upgrade to Tika 0.9 (jnioche) + +* NUTCH-955 Ivy configuration improvements. Upgrade to Xerces 2.9.1 and Restlet 2.0.5 (alexis via ab) + +* NUTCH-962 max. 
redirects not handled correctly: fetcher stops at max-1 redirects (Sebastian Nagel via ab) + +* NUTCH-964 Upgraded Xerces to 2.91 (markus) + +* NUTCH-824 Crawling - File Error 404 when fetching file with an hexadecimal character in the file name (Michela Becchi via jnioche) + +* NUTCH-954 Strict application of Content-Length limit for http protocols (Alexis Detreglode via jnioche) + +* NUTCH-953 Fixed crawl command in Nutch script (Alexis Detreglode via jnioche) + +* NUTCH-950 DomainURLFilter throws NPE on bogus urls (Alexis Detreglode via jnioche) + +* NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs (Stondet via markus) + +* NUTCH-912 MoreIndexingFilter does not parse docx and xlsx date formats (Markus Jelsma, jnioche) + +* NUTCH-936 LanguageIdentifier should not set empty lang field on NutchDocument (Markus Jelsma via jnioche) + +* NUTCH-949 Conflicting ANT jars in classpath (jnioche) + +* NUTCH-825 Publish nutch artifacts to central maven repository (mattmann) + +* NUTCH-913 Nutch should use new namespace for Gora (dogacan) + +* NUTCH-714 Need a SFTP and SCP Protocol Handler (Sanjoy Ghosh, mattmann) + +* NUTCH-894 Move statistical language identification from indexing to parsing step + (Sertan Alkan via dogacan) + +* NUTCH-901 Make index-more plug-in configurable (Markus Jelsma via mattmann) + +* NUTCH-862 HttpClient null pointer exception (Sebastian Nagel via ab) + +* NUTCH-904 "-resume" option is always processed as "false" in FetcherJob + (Faruk Berksöz via dogacan) + +* NUTCH-905 Configurable file protocol parent directory crawling (Thorsten Scherler, mattmann, ab) + +* NUTCH-716 Make subcollection index filed multivalued (Dmitry Lihachev via jnioche) + +* NUTCH-884 FetcherJob should run more reduce tasks than default (ab) + +* NUTCH-883 Remove unused parameters from nutch-default.xml (jnioche) + +* NUTCH-886 A .gitignore file for Nutch (dogacan) + +* NUTCH-872 Change the default fetcher.parse to FALSE (ab). 
+ +* NUTCH-861 Renamed HTMLParseFilter into ParseFilter + +* NUTCH-876 Remove remaining robots/IP blocking code in lib-http (ab) + +* NUTCH-851 Port logging to slf4j (jnioche) + +* NUTCH-564 External parser supports encoding attribute (Antony Bowesman, mattmann) + +* NUTCH-873 Ivy configuration settings don't include Gora (mattmann) + +* NUTCH-870 Injector should add the metadata before calling injectedScore (jnioche via mattmann) + +* NUTCH-867 Port Nutch benchmark to Nutchbase (ab) + +* NUTCH-869 Add parse-html back (jnioche) + +* NUTCH-871 MoreIndexingFilter missing date format (Max Lynch via mattmann) + +* NUTCH-696 Timeout for Parser (ab, jnioche) + +* NUTCH-774 Retry interval in crawl date is set to 0 (Reinhard Schwab via mattmann) + +* NUTCH-697 Generate log output for solr indexer and dedup (Dmitry Lihachev, Jeroen van Vianen via mattmann) + +* NUTCH-844 Improve NutchConfiguration (ab) + +* NUTCH-850 SolrDeleteDuplicates needs to clone the SolrRecord objects (jnioche) + +* NUTCH-845 Native hadoop libs not available through maven (ab) + +* NUTCH-843 Separate the build and runtime environments (ab) + +* NUTCH-821 Use ivy in nutch builds (Enis Soztutar, jnioche) + +* NUTCH-838 Add timing information to all Tool classes (Jeroen van Vianen, mattmann) + +* NUTCH-837 Remove search servers and Lucene dependencies (ab) + +* NUTCH-836 Remove deprecated parse plugins (jnioche) + +* NUTCH-835 Document deduplication failed using MD5Signature (Sebastian Nagel via ab) + +* NUTCH-278 Fetcher-status might need clarification: kbit/s instead of kb/s shown (Alex McLintock via mattmann) + +* NUTCH-833 Website is still Lucene branded (mattmann, Alex McLintock) + +* NUTCH-832 Website menu has lots of broken links - in particular the API docs (Alex McLintock via mattmann) + +* NUTCH-921 Reduce dependency of Nutch on config files (ab) + +* NUTCH-907 DataStore API doesn't support multiple storage areas for multiple disjoint crawls (Sertan Alkan via ab) + +* NUTCH-880 REST API for 
Nutch (ab) + +* NUTCH-930 Remove remaining dependencies on Lucene API (ab) + +* NUTCH-931 Simple admin API to fetch status and stop the service (ab) + +* NUTCH-932 Bulk REST API to retrieve crawl results as JSON (ab) + + +Release 1.1 - 2010-06-06 + +* NUTCH-819 Included Solr schema.xml and solrindex-mapping.xml don't play together (ab) + +* NUTCH-818 Bugfix : Parse-tika uses minorCodes instead of majorCodes in ParseStatus (jnioche) + +* NUTCH-816 Add zip target to build.xml (mattmann) + +* NUTCH-732 Subcollection plugin not working (Filipe Antunes, ab) + +* NUTCH-815 Invalid blank line before If-Modified-Since header (Pascal Dimassimo via ab) + +* NUTCH-814 SegmentMerger bug (Rob Bradshaw, ab) + +* NUTCH-812 Crawl.java incorrectly uses the Generator API resulting in NPE (Phil Barnett via mattmann and ab) + +* NUTCH-810 Upgrade to Tika 0.7 (jnioche) + +* NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche) + +* NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) + +* NUTCH-784 CrawlDBScanner (jnioche) + +* NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche) + +* NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) + +* NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) + +* NUTCH-787 Upgrade Lucene to 3.0.1. 
(Dawid Weiss via ab) + +* NUTCH-796 Zero results problems difficult to troubleshoot due to lack of logging (ab) + +* NUTCH-801 Remove RTF and MP3 parse plugins (jnioche) + +* NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche) + +* NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) + +* NUTCH-782 Ability to order htmlparsefilters (jnioche) + +* NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) + +* NUTCH-790 Some external javadoc links are broken (siren) + +* NUTCH-766 Tika parser (jnioche via mattmann) + +* NUTCH-786 Improvement to the list of suffix domains (jnioche) + +* NUTCH-775 Enhance searcher interface (siren) + +* NUTCH-781 Update Tika to v0.6 (jnioche) + +* NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche) + +* NUTCH-655 Injecting Crawl metadata (jnioche) + +* NUTCH-658 Use counters to report fetching and parsing status (jnioche) + +* NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann) + +* NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via ab) + +* NUTCH-769 Fetcher to skip queues for URLS getting repeated exceptions + (Julien Nioche via ab) + +* NUTCH-768 - Upgrade Nutch 1.0 to use Hadoop 0.20.1, also upgrades Xerces to + version 2.9.1. (kubes) + +* NUTCH-712 ParseOutputFormat should catch java.net.MalformedURLException + coming from normalizers (Julien Nioche via ab) + +* NUTCH-741 Job file includes multiple copies of nutch config files + (Kirby Bohling via ab) + +* NUTCH-739 SolrDeleteDuplications too slow when using hadoop (Dmitry Lihachev via ab) + +* NUTCH-738 Close SegmentUpdater when FetchedSegments is closed + (Martina Koch, Kirby Bohling via ab) + +* NUTCH-746 NutchBeanConstructor does not close NutchBean upon contextDestroyed, + causing resource leak in the container. 
(Kirby Bohling via ab) + +* NUTCH-772 Upgrade Nutch to use Lucene 2.9.1 (ab) + +* NUTCH-760 Allow field mapping from Nutch to Solr index (David Stuart, ab) + +* NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab) + +* NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab) + +* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab) + +* NUTCH-765 - Allow Crawl class to call Either Solr or Lucene Indexer (kubes) + +* NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when + invoked using crawl command (Susam Pal via dogacan) + +* NUTCH-721 - Fetcher2 Slow (Julien Nioche via dogacan) + +* NUTCH-702 - Lazy Instanciation of Metadata in CrawlDatum (Julien Nioche via dogacan) + +* NUTCH-707 - Generation of multiple segments in multiple runs returns only 1 segment + (Michael Chen, ab) + +* NUTCH-730 - NPE in LinkRank if no nodes with which to create the WebGraph + (Dennis Kubes via ab) + +* NUTCH-731 - Redirection of robots.txt in RobotRulesParser (Julien Nioche via ab) + +* NUTCH-757 - RequestUtils getBooleanParameter() always returns false + (Niall Pemberton via ab) + +* NUTCH-754 - Use GenericOptionsParser instead of FileSystem.parseArgs() (Julien + Nioche via ab) + +* NUTCH-756 - CrawlDatum.set() does not reset Metadata if it is null (Julien Nioche + via ab) + +* NUTCH-679 - Fetcher2 implementing Tool (Julien Nioche via ab) + +* NUTCH-758 - Set subversion eol-style to "native" (Niall Pemberton via ab) + +Release 1.0 - 2009-03-23 + + 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) + + 2. NUTCH-443 - Allow parsers to return multiple Parse objects. + (Dogacan Guney et al, via ab) + + 3. NUTCH-393 - Indexer should handle null documents returned by filters. + (Eelco Lempsink via ab) + + 4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren) + + 5. 
NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other + bots in robots.txt (Dogacan Guney via siren) + + 6. NUTCH-482 - Remove redundant plugin lib-log4j (siren) + + 7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin + (siren) + + 8. NUTCH-161 - Change Plain text parser to + use parser.character.encoding.default property for fall back encoding + (KuroSaka TeruHiko, siren) + + 9. NUTCH-61 - Support for adaptive re-fetch interval and detection of + unmodified content. (ab) + +10. NUTCH-392 - OutputFormat implementations should pass on Progressable. + (cutting via ab) + +11. NUTCH-495 - Unnecessary delays in Fetcher2 (dogacan) + +12. NUTCH-443 - allow parsers to return multiple Parse object, this will speed + up the rss parser (dogacan via mattmann). This update is a fix and semantics + change from the original patch for NUTCH-443. The original patch did not tell + the Indexer to read crawl_parse too so that it can pickup sub-urls' fetch + datums. This patch addresses that issue. Now, if Fetcher gets a null content, + instead of pushing an empty content, it filters the null content. + +13. NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of + Parse object. (Gal Nitzan via dogacan) + +14. NUTCH-489 - URLFilter-suffix management of the url path when the url contains + some query parameters. (Emmanuel Joke via dogacan) + +15. NUTCH-502 - Bug in SegmentReader causes infinite loop. + (Ilya Vishnevsky via dogacan) + +16. NUTCH-444 Possibly use a different library to parse RSS feed for improved + performance and compatibility. This patch introduced a new plugin, feed, + that includes an index filter and a parse plugin for feeds that uses ROME. + There was discussion to remove parse-rss, in light of the feed plugin, + however, this patch does not explicitly remove parse-rss. (dogacan, mattmann) + +17. NUTCH-471 - Fix synchronization in NutchBean creation. + (Enis Soztutar via dogacan) + +18. 
Upgrade to Lucene 2.2.0 and Hadoop 0.12.3. (ab) + +19. NUTCH-468 - Scoring filter should distribute score to all outlinks at + once. (dogacan) + +20. NUTCH-504 - NUTCH-443 broke parsing during fetching. (dogacan) + +21. NUTCH-497 - Extreme Nested Tags causes StackOverflowException in + DomContentUtils...Spider Trap. (kubes) + +22. NUTCH-434 - Replace usage of ObjectWritable with something based on + GenericWritable. (dogacan) + +23. NUTCH-499 - Refactor LinkDb and LinkDbMerger to reuse code. (dogacan) + +24. NUTCH-498 - Use Combiner in LinkDb to increase speed of linkdb generation. + (Espen Amble Kolstad via dogacan) + +25. NUTCH-507 - lib-lucene-analyzers jar defintion is wrong in plugin.xml. + (Emmanuel Joke via dogacan) + +26. NUTCH-503 - Generator exits incorrectly for small fetchlists. + (Vishal Shah via dogacan) + +27. NUTCH-505 - Outlink urls should be validated. (dogacan) + +28. NUTCH-510 - IndexMerger delete working dir. (Enis Soztutar via dogacan) + +29. NUTCH-513 - suffix-urlfilter.txt does not have a template. (dogacan) + +30. NUTCH-515 - Next fetch time is set incorrectly. (dogacan) + +30. NUTCH-506 - Nutch should delegate compression to Hadoop. (dogacan) + +31. NUTCH-517 - build encoding should be UTF-8. (Enis Soztutar via dogacan). + +32. NUTCH-518 - Fix OpicScoringFilter to respect scoring filter chaining. + (Enis Soztutar via dogacan) + +33. NUTCH-516 - Next fetch time is not set when it is a + CrawlDatum.STATUS_FETCH_GONE. (Emmanuel Joke via dogacan) + +34. NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException + when trying to rerun dedup on a segment. (Vishal Shah via dogacan) + +35. NUTCH-514 - Indexer should only index pages with fetch status SUCCESS. + (dogacan) Note: There is a bigger problem, i.e how to deal + with redirected pages, and this issue can be considered as a band-aid + for the time being. See NUTCH-273 and NUTCH-353 for more details. + +36. 
NUTCH-533 - LinkDbMerger: url normalized is not updated in the key and + inlinks list. (Emmanuel Joke via dogacan) + +37. NUTCH-535 -ParseData's contentMeta accumulates unnecessary values during + parse. (dogacan) + +38. NUTCH-522 - Use URLValidator in the Injector. (Emmanuel Joke, dogacan) + +39. NUTCH-536 - Reduce number of warnings in nutch core. (dogacan) + +40. NUTCH-439 - Top Level Domains Indexing / Scoring. Also adds + domain-related utilities. (Enis Soztutar via dogacan) + +41. NUTCH-544 - Upgrade Carrot2 clustering plugin to the newest stable + release (2.1). (Dawid Weiss via dogacan) + +42. NUTCH-545 - Configuration and OnlineClusterer get initialized in every + request. (Dawid Weiss via dogacan) + +43. NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time. + (Emmanuel Joke via dogacan) + +44. NUTCH-550 - Parse fails if db.max.outlinks.per.page is -1. (dogacan) + +45. NUTCH-546 - file URL are filtered out by the crawler. (dogacan) + +46. NUTCH-554 - Generator throws IOException on invalid urls. + (Brian Whitman via ab) + +47. NUTCH-529 - NodeWalker.skipChildren doesn't work for more than 1 child. + (Emmanuel Joke via dogacan) + +48. NUTCH-25 - needs 'character encoding' detector. + (Doug Cook, dogacan, Marcin Okraszewski, Renaud Richardet via dogacan) + +49. NUTCH-508 - ${hadoop.log.dir} and ${hadoop.log.file} are not propagated + to the tasktracker. (Mathijs Homminga, Emmanuel Joke via dogacan) + +50. NUTCH-562 - Port mime type framework to use Tika mime detection framework. + (mattmann) + +51. NUTCH-488 - Avoid parsing uneccessary links and get a more relevant outlink + list. (Emmanuel Joke, Marcin Okraszewski via kubes) + +52. NUTCH-501 - Implement a different caching mechanism for objects cached in + configuration. (dogacan) + +53. NUTCH-552 - Upgrade Nutch to Hadoop 0.15.x. (kubes) + +54. NUTCH-565 - Arc File to Nutch Segments Converter. (kubes) + +55. NUTCH-547 - Redirection handling: YahooSlurp's algorithm. 
+ (dogacan, kubes via dogacan) + +56. NUTCH-548 - Move URLNormalizer from Outlink to ParseOutputFormat. + (Emmanuel Joke via dogacan) + +57. NUTCH-538 - Delete unused classes under o.a.n.util. (dogacan) + +58. NUTCH-494 - FindBugs: CrawlDbReader and DeleteDuplicates. (dogacan) + +59. NUTCH-574 - Including inlink anchor text in index can create irrelevant + search results. Created index-anchor plugin, removed functionality from + index-basic plugin. For backwards compatibility, add index-anchor plugin to + nutch-site.xml plugin.includes. (kubes) + +60. NUTCH-581 - DistributedSearch does not update search servers added to + search-servers.txt on the fly. (Rohan Mehta via kubes) + +61. NUTCH-586 - Add option to run compiled classes without job file + (enis via ab) + +62. NUTCH-559 - NTLM, Basic and Digest Authentication schemes for web/proxy + server. (Susam Pal via dogacan) + +63. NUTCH-534 - SegmentMerger: add -normalize option (Emmanuel Joke via ab) + +64. NUTCH-528 - CrawlDbReader: add some new stats + dump into a CSV format + (Emmanuel Joke via ab) + +65. NUTCH-597 - NPE in Fetcher2 (Remco Verhoef via ab) + +66. NUTCH-584 - urls missing from fetchlist (Ruslan Ermilov, ab) + +67. NUTCH-580 - Remove deprecated hadoop api calls (FS) (siren) + +68. NUTCH-587 - Upgrade to Hadoop 0.15.3 (kubes) + +69. NUTCH-604 - Upgrade to Lucene 2.3.0 (ab) + +70. NUTCH-602 - Allow configurable number of handlers for search servers + (hartbecke via kubes) + +71. NUTCH-607 - Update build.xml to include tika jar when building war (kubes) + +72. NUTCH-608 - Upgrade nutch to use released apache-tika-0.1-incubating (mattmann) + +73. NUTCH-606 - Refactoring of Generator, run all urls through checks (kubes) + +74. NUTCH-605 - Change deprecated configuration methods for Hadoop (kubes) + +75. NUTCH-603 - Add more default url normalizations (kubes) + +76. NUTCH-611 - Upgrade Nutch to use Hadoop 0.16 (kubes) + +77. 
NUTCH-44 - Too many search results, limits max results returned from a + single search. (Emilijan Mirceski and Susam Pal via kubes) + +78. NUTCH-567 - Proper (?) handling of URIs in TagSoup. TagSoup library is + updated to 1.2 version. (dogacan) + +79. NUTCH-613 - Empty summaries and cached pages (kubes via ab) + +80. NUTCH-612 - URL filtering was disabled in Generator when invoked + from Crawl (Susam Pal via ab) + +81. NUTCH-601 - Recrawling on existing crawl directory (Susam Pal via ab) + +82. NUTCH-575 - NPE in OpenSearchServlet (John H. Lee via ab) + +83. NUTCH-126 - Fetching https does not work with a proxy (Fritz Elfert via ab) + +84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval. + Guard against reprUrl being null. (Emmanuel Joke, ab) + +85. NUTCH-616 - Reset Fetch Retry counter when fetch is successful (Emmanuel + Joke, ab) + +86. NUTCH-220 - Upgrade to PDFBox 0.7.3 (ab) + +87. NUTCH-223 - Crawl.java uses Integer.MAX_VALUE (Jeff Ritchie via ab) + +88. NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API. + (Emmanuel Joke, dogacan, ab) + +89. NUTCH-620 - BasicURLNormalizer should collapse runs of slashes with a + single slash. (Mark DeSpain via ab) + +90. NUTCH-500 - Add hadoop masters configuration file into conf folder. + (Emmanuel Joke via kubes) + +91. NUTCH-596 - ParseSegments parse content even if its not + CrawlDatum.STATUS_FETCH_SUCCESS (dogacan) + +92. NUTCH-618 - Tika error "Media type alias already exists" (mattmann,kubes) + +93. NUTCH-634 - Upgrade Nutch to Hadoop 0.17.1 (Michael Gottesman, Lincoln + Ritter, ab) + +94. NUTCH-641 - IndexSorter inorrectly copies stored fields (ab) + +95. NUTCH-645 - Parse-swf unit test failing (ab) + +96. NUTCH-642 - Unit tests fail when run in non-local mode (ab) + +97. NUTCH-639 - Change LuceneDocumentWrapper visibility from + private to _public_ (Guillaume Smet via dogacan) + +98. NUTCH-651 - Remove bin/{start|stop}-balancer.sh from svn + tracking. (dogacan) + +99. 
NUTCH-375 - Add support for Content-Encoding: deflated + (Pascal Beis, ab) + +100. NUTCH-633 - ParseSegment no longer allow reparsing. + (dogacan) + +101. NUTCH-653 - Upgrade to hadoop 0.18. (dogacan) + +102. NUTCH-621 - Nutch needs to declare it's crypto usage (mattmann) + +103. NUTCH-654 - urlfilter-regex's main does not work. + (dogacan) + +104. NUTCH-640 - confusing description "set it to Integer.MAX_VALUE". + (dogacan) + +105. NUTCH-662 - Upgrade Nutch to use Lucene 2.4. (kubes) + +106. NUTCH-663 - Upgrade Nutch to use Hadoop 0.19 (kubes) + +107. NUTCH-647 - Resolve URLs tool (kubes) + +108. NUTCH-665 - Search Load Testing Tool (kubes) + +109. NUTCH-667 - Input Format for working with Content in Hadoop Streaming + (kubes) + +110. NUTCH-635 - LinkAnalysis Tool for Nutch. (kubes) + +111. NUTCH-646 - New Indexing Framework for Nutch. (kubes) + +112. NUTCH-668 - Domain URL Filter. (kubes) + +113. NUTCH-594 - Serve Nutch search results in multiple formats including + XML and JSON. (kubes) + +114. NUTCH-442 - Integrate Solr/Nutch. (dogacan, original version by siren) + +115. NUTCH-652 - AdaptiveFetchSchedule#setFetchSchedule doesn't calculate + fetch interval correctly. (dogacan) + +116. NUTCH-627 - Minimize host address lookup (Otis Gospodnetic) + +117. NUTCH-678 - Hadoop 0.19 requires an update of jets3t. + (julien nioche via dogacan) + +118. NUTCH-681 - parse-mp3 compilation problem. + (Wildan Maulana via dogacan) + +119. NUTCH-676 - MapWritable is written inefficiently and confusingly. + (dogacan) + +120. NUTCH-579 - Feed plugin only indexes one post per feed due to identical + digest. (dogacan) + +121. NUTCH-571 - parse-mp3 plugin doesn't always index album of mp3. + (Joseph Chen, dogacan) + +122. NUTCH-682 - SOLR indexer does not set boost on the document. + (julien nioche via dogacan) + +123. NUTCH-279 - Additions to urlnormalizer-regex (Stefan Neufeind, ab) + +124. NUTCH-671 - JSP errors in Nutch searcher webapp (Edwin Chu via ab) + +125. 
NUTCH-643 - ClassCastException in PDF parser (Guillaume Smet, ab) + +126. NUTCH-636 - Httpclient plugin https doesn't work on IBM JRE + (Curtis d'Entremont, ab) + +127. NUTCH-683 - NUTCH-676 broke CrawlDbMerger. (dogacan) + +128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException + (Stefan Will, siren) + +129. NUTCH-691 - Update jakarta poi jars to the most relevant version + (Dmitry Lihachev via siren) + +130. NUTCH-563 - Include custom fields in BasicQueryFilter + (Julien Nioche via siren) + +131. NUTCH-695 - Incorrect mime type detection by MoreIndexingFilter plugin + (Dmitry Lihachev via siren) + +132. NUTCH-694 - Distributed Search Server fails (siren) + +133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links + set at cross domain redirects (Remco Verhoef, dogacan via siren) + +134. NUTCH-247 - Robot parser to restrict (kubes, siren) + +135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan + via siren) + +136. NUTCH-699 - Add an "official" solr schema for solr integration (dogacan, + Dmitry Lihachev via siren) + +137. NUTCH-703 - Upgrade to Hadoop 0.19.1 (ab) + +138. NUTCH-419 - Unavailable robots.txt kills fetch (Carsten Lehmann, + Doug Cook via ab) + +139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren) + +140. NUTCH-669 - Consolidate code for Fetcher and Fetcher2 (siren) + +141. NUTCH-711 - Indexer failing after upgrade to Hadoop 0.19.1 (ab) + +142. NUTCH-684 - Dedup support for Solr. (dogacan) + +143. NUTCH-715 - Subcollection plugin doesn't work with default + subcollections.xml file (Dmitry Lihachev via siren) + +144. NUTCH-722 - Nutch contains JAI jars that we cannot redistribute + +Release 0.9 - 2007-04-02 + + 1. Changed log4j confiquration to log to stdout on commandline + tools (siren) + + 2. NUTCH-344 - Fix for thread blocking issue (Greg Kim via siren) + + 3. NUTCH-260 - Update hadoop version to 0.5.0 (Renaud Richardet, + siren) + + 4. 
Optionally skip pages with abnormally large values of Crawl-Delay + (Dennis Kubes via ab) + + 5. Change readdb -stats to use CombiningCollector (ab) + + 6. NUTCH-348 - Fix Generator to select highest scoring pages (Chris + Schneider and Stefan Groschupf via ab) + + 7. NUTCH-347 - Adjust plugin build script not to emit warnings when copying + dependant jars (siren) + + 8. NUTCH-338 - Remove the text parser as an option for parsing PDF files + in parse-plugins.xml (Chris A. Mattmann via siren) + + 9. NUTCH-105 - Network error during robots.txt fetch causes file to + be ignored (Greg Kim via siren) + +10. NUTCH-367 - DistributedSearch thown ClassCastException (siren) + +11. NUTCH-332 - Fix the problem of doubling scores caused by links pointing + to the current page (e.g. anchors). (Stefan Groschupf via ab) + +12. NUTCH-365 - Flexible URL normalization (ab) + +13. NUTCH-336 - Differentiate between newly discovered pages and newly + injected pages (Chris Schneider via ab) NOTE: this changes the + scoring API, filter implementations need to be updated. + +14. NUTCH-337 - Fetcher ignores the fetcher.parse value (Stefan Groschupf + via ab) + +15. NUTCH-350 - Urls blocked by http.max.delays incorrectly marked as GONE + (Stefan Groschupf via ab) + +16. NUTCH-374 - when http.content.limit be set to -1 and + Response.CONTENT_ENCODING is gzip or x-gzip , it can not fetch any thing + (King Kong via pkosiorowski) + +17. NUTCH-383 - upgrade to Hadoop 0.7.1 and Lucene 2.0.0. (ab) + + ****************************** WARNING !!! ******************************** + * This upgrade breaks data format compatibility. A tool 'convertdb' * + * was added to migrate existing CrawlDb-s to the new format. Segment data * + * can be partially migrated using 'mergesegs', however segments will * + * require re-parsing (and consequently re-indexing). * + ****************************** WARNING !!! ******************************** + +18. 
NUTCH-371 - DeleteDuplicates now correctly implements both parts of + the algorithm. (ab) + +19. NUTCH-391 - ParseUtil logs file contents to log file when it cannot + find parser (siren) + +20. NUTCH-379 - ParseUtil does not pass through the content's URL to the + ParserFactory (Chris A. Mattmann via siren) + +21. NUTCH-361, NUTCH-136 - When jobtracker is 'local' generate only one + partition. (ab) + +22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren) + +23. NUTCH-395 - Increase fetching speed (siren) + +24. NUTCH-388 - nutch-default.xml has outdated example for urlfilter.order + (reported by Jared Dunne) + +25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren) + +26. NUTCH-403 - Make URL filtering optional in Generator (siren) + +27. NUTCH-405 - Content object is not properly initialized in map method + of ParseSegment (siren) + +28. NUTCH-362 - Remove parse-text from unsupported filetypes in + parse-plugins.xml (siren) + +29. NUTCH-305 - Update crawl and url filter lists to exclude + jpeg|JPEG|bmp|BMP, suffix-urlfilter.txt (contributed by Stefan + Neufeind) is also updated (siren) + +30. NUTCH-406 - Metadata tries to write null values (mattmann) + +31. NUTCH-415 - Generator should mark selected records in CrawlDb. + Due to increased resource consumption this step is optional. + Application-level locking has been added to prevent concurrent + modification of databases. (ab) + +32. NUTCH-416 - CrawlDatum status and CrawlDbReducer refactoring. It is + now possible to correctly update CrawlDb from multiple segments. + Introduce new status codes for temporary and permanent + redirection. (ab) + +33. NUTCH-322 - Fix Fetcher to store redirected pages and to store + protocol-level status. This also should fix NUTCH-273. (ab) + +34. Change default Fetcher behavior not to follow redirects immediately. + Instead Fetcher will record redirects as new pages to be added to CrawlDb. + This also partially addresses NUTCH-273. (ab) + +35. 
Detect and report when Generator creates 0-sized segments. (ab) + +36. Fix Injector to preserve already existing CrawlDatum if the seed list + being injected also contains such URL. (ab) + +37. NUTCH-425, NUTCH-426 - Fix anchors pollution. Continue after + skipping bad URLs. (Michael Stack via ab) + +38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains + Filters that are not in plugin.includes (Stefan Groschupf, siren) + +39. NUTCH-421 - Allow predeterminate running order of indexing filters + (Alan Tanaman, siren) + +40. When indexing pages with redirection, drop all intermediate pages and + index only the final page. (ab) + +41. Upgrade to Hadoop 0.10.1. (ab) + +42. NUTCH-420 - Fix a bug in DeleteDuplicates where results depended on the + order in which IndexDoc-s are processed. (Dogacan Guney via ab) + +43. NUTCH-428 - NullPointerException thrown when agent name is not + configured properly. Changed to throw RuntimeException instead. + (siren) + +44. NUTCH-430 - Integer overflow in HashComparator.compare (siren) + +45. NUTCH-68 - Add a tool to generate arbitrary fetchlists. (ab) + +46. NUTCH-433 - java.io.EOFException in newer nightlies in mergesegs + or indexing from hadoop.io.DataOutputBuffer (siren) + +47. NUTCH-339 - Fetcher2: a queue-based fetcher implementation. (ab) + +48. NUTCH-390 - Javadoc warnings (mattmann) + +49. NUTCH-449 - Make junit output format configurable. (nigel via cutting) + +50. NUTCH-432 - Fix a bug where platform name with spaces would break the + bin/nutch script. (Brian Whitman via ab) + +51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release. (ab) + +52. NUTCH-167 - Observation of robots "noarchive" directive. (ab) + +53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins + framework to operate properly (Heiko Dietze via mattmann) + +54. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan + Groschupf via kubes) + +55. 
NUTCH-436 - Incorrect handling of relative paths when the embedded URL + path is empty (kubes) + +56. Upgrade to Hadoop 0.12.1 release. (ab) + +57. NUTCH-246 - Incorrect segment size being generated due to time + synchronization issue (Stefan Groschupf via ab) + +58. Upgrade to Hadoop 0.12.2 release. (ab) + +59. NUTCH-333 - SegmentMerger and SegmentReader should use NutchJob. (Michael + Stack and Dogacan Guney via kubes) + +Release 0.8 - 2006-07-25 + + 0. Totally new architecture, based on hadoop + [http://lucene.apache.org/hadoop] (cutting) + + 1. NUTCH-107 - Typo in plugin/urlfilter-*/plugin.xml. (Stephen Cross). + + 2. NUTCH-108 - Log hosts that exceed generate.max.per.host. + (Rod Taylor via cutting) + + 3. NUTCH-88 - Enhance ParserFactory plugin selection policy + (jerome) + + 4. NUTCH-124 - Protocol-httpclient does not follow redirects when + fetching robots.txt (cutting) + + 5. NUTCH-130 - Be explicit about target JVM when building (1.4.x?) + (stack@archive.org, cutting) + + 6. NUTCH-114 - Getting number of urls and links from crawldb + (Stefan Groschupf via ab) + + 7. NUTCH-112 - Link in cached.jsp page to cached content is an + absolute link (Chris A. Mattmann via jerome) + + 8. NUTCH-135 - Http header meta data are case insensitive in the + real world (Stefan Groschupf via jerome) + + 9. NUTCH-145 - Build of war file fails on Chinese (zh) .xml files due + to UTF-8 BOM (KuroSaka TeruHiko via siren) + +10. NUTCH-121 - SegmentReader for mapred (Rod Taylor via ab) + +11. Added support for OpenSearch (cutting) + +12. NUTCH-142 - NutchConf should use the thread context classloader + (Mike Cannon-Brookes via pkosiorowski) + +13. NUTCH-160 - Use standard Java Regex library rather than + org.apache.oro.text.regex (Rod Taylor via cutting) + +14. NUTCH-151 - CommandRunner can hang after the main thread exec is + finished and has inefficient busy loop (Paul Baclace via cutting) + +15. NUTCH-174 - Problem encountered with ant during compilation + +16. 
NUTCH-190 - ParseUtil drops reason for failed parse + (stack@archive.org via ab) + +17. NUTCH-169 - Remove static NutchConf (Marko Bauhardt via ab) + +18. NUTCH-194 - Nutch-169 introduced two tiny bugs (Marko Bauhardt via ab) + +19. NUTCH-178 - in search.jsp must be session creation "false" + (YourSoft via siren) + +20. NUTCH-200 - OpenSearch Servlet ist broken + (Marko Bauhardt via siren) + +21. NUTCH-81 - Webapp only works when deployed in root + (AJ Banck, Michael Nebel via siren) + +22. NUTCH-139 - Standard metadata property names in the ParseData + metadata (Chris A. Mattmann, jerome) + +23. NUTCH-192 - Meta data support for CrawlDatum + (Stefan Groschupf via ab) + +24. NUTCH-52 - Parser plugin for MS Excel files + (Rohit Kulkarni via jerome) + +25. NUTCH-53 - Parser plugin for Zip files + (Rohit Kulkarni via jerome) + +26. NUTCH-137 - footer is not displayed in search result page + (KuroSaka TeruHiko via siren) + +27. NUTCH-118 - FAQ link points to invalid URL + (Steve Betts via siren) + +28. NUTCH-184 - Serbian (sr, Cyrilic) and Serbo-Croatian (sh, Latin) + translation (Ivan Sekulovic via siren) + +29. NUTCH-211 - FetchedSegments leave readers open (Stefan Groschupf + via cutting) + +30. NUTCH-140 - Add alias capability in parse-plugins.xml file that + allows mimeType->extensionId mapping (Chris A. Mattmann via jerome) + +31. NUTCH-214 - Added Links to web site to search mailling list + (Jake Vanderdray via jerome) + +32. NUTCH-204 - Multiple field values in HitDetails + (Stefan Groschupf via jerome) + +33. NUTCH-219 - file.content.limit & ftp.content.limit should be changed + to -1 to be consistent with http (jerome) + +34. NUTCH-221 - Prepare nutch for upcoming lucene 2.0 (siren) + +35. NUTCH-91 - Empty encoding causes exception (Michael Nebel via + pkosiorowski) + +36. NUTCH-228 - Clustering plugin descriptor broken (Dawid Weiss via + jerome) + +37. NUTCH-229 - Improved handling of plugin folder configuration + (Stefan Groschupf via ab) + +38. 
NUTCH-206 - Search server throws InstantiationException (ab) + +39. NUTCH-203 - ParseSegment throws InstantiationException (Marko Bauhardt + via ab) + +40. NUTCH-3 - Multi values of header discarded (Stefan Groschupf via ab) + +41. Update to lucene 1.9.1 (cutting) + +42. NUTCH-235 - Duplicate Inlink values (ab) + +43. NUTCH-234 - Clustering extension code cleanups and a real + JUnit test case for the current implementation (Dawid Weiss via ab) + +44. NUTCH-210 - Context.xml file for Nutch web application + (Chris A. Mattmann via jerome) + +45. NUTCH-231 - Invalid CSS entries (AJ Banck via jerome) + +46. NUTCH-232 - Search.jsp has multiple search forms creating + invalid html / incorrect focus function (jerome) + +47. NUTCH-196 - lib-xml and lib-log4j plugins (ab, jerome) + +48. NUTCH-244 - Inconsistent handling of property values + boundaries / unable to set db.max.outlinks.per.page to + infinite (jerome) + +49. NUTCH-245 - DTD for plugin.xml configuration files + (Chris A. Mattmann via jerome) + +50. NUTCH-250 - Generate to log truncation caused by + generate.max.per.host (Rod Taylor via cutting) + +51. NUTCH-125 - OpenOffice Parser plugin (ab) + +52. Switch from using java.io.File to org.apache.hadoop.fs.Path. + (cutting) + +53. NUTCH-240 - Scoring API: extension point, scoring filters and + an OPIC plugin (ab) + +54. NUTCH-134 - Summarizer doesn't select the best snippets (jerome) + +55. NUTCH-268 - Generator and lib-http use different definitions of + "unique host" (ab) + +56. NUTCH-280 - Url query causes NullPointerException (Grant Glouser + via siren) + +57. NUTCH-285 - LinkDb Fails rename doesn't create parent directories + (Dennis Kubes via ab) + +58. NUTCH-201 - Add support for subcollections + (siren) + +59. NUTCH-298 - If a 404 for a robots.txt is returned a NPE is thrown + (Stefan Groschupf via jerome) + +60. NUTCH-275 - Fetcher not parsing XHTML-pages at all (jerome) + +61. 
NUTCH-301 - CommonGrams loads analysis.common.terms.file for each query + (Stefan Groschupf via jerome) + +62. NUTCH-110 - OpenSearchServlet outputs illegal xml characters + (stack@archive.org via siren) + +63. NUTCH-292 - OpenSearchServlet: OutOfMemoryError: Java heap space + (Stefan Neufeind via siren) + +64. NUTCH-307 - Wrong configured log4j.properties (jerome) + +65. NUTCH-303 - Logging improvements (jerome) + +66. NUTCH-308 - Maximum search time limit (ab) + +67. NUTCH-306 - DistributedSearch.Client liveAddresses concurrency + problem (Grant Glouser via siren) + +68. Update to hadoop-0.4 (Milind Bhandarkar, cutting) + +69. NUTCH-317 - Clarify what the queryLanguage argument of + Query.parse(...) means (jerome) + +70. Added alternative experimental web gui in contrib containing + extensions like subcollection, keymatch, user preferences, + caching, implemented mainly using tiles and jstl (siren) + +71. NUTCH-320 DmozParser does not output list of urls to stdout + but to a log file instead. Original functionality restored. + +72. NUTCH-271 - Add ability to limit crawling to the set of initially + injected hosts (db.ignore.external.links) (Philippe Eugene, + Stefan Neufeind via ab) + +73. NUTCH-293 - Support for Crawl-Delay (Stefan Groschupf via ab) + +74. NUTCH-327 - Fixed logging directory on cygwin (siren) + +Release 0.7 - 2005-08-17 + + 1. Added support for "type:" in queries. Search results are limited/qualified + by mimetype or its primary type or sub type. For example, + (1) searching with "type:application/pdf" restricts results + to pages which were identified to be of mimetype "application/pdf". + (2) with "type:application", nutch will return pages of + primary type "application". + (3) with "type:pdf", only pages of sub type "pdf" will be listed. + (John Xing, 20050120) + + 2. Added support for "date:" in queries. Last-Modified is indexed. + Search results are restricted by lower and upper date (inclusive) + as date:yyyymmdd-yyyymmdd. 
For example, date:20040101-20041231 + only returns pages with Last-Modified in year 2004. + (John Xing, 20050122) + + 3. Add URLFilter plugin interface and convert existing url filters into + plugins. (John Xing, 20050206) + + 4. Add UpdateSegmentsFromDb tool, which updates the scores and + anchors of existing segments with the current values in the web + db. This is used by CrawlTool, so that pages are now only fetched + once per crawl. (Doug Cutting, 20050221) + + 5. Moved code into org.apache.nutch sub-packages. Changed license to + Apache 2.0. Removed jar files whose licenses do not permit + redistribution by Apache. Disabled compilation of plugins which + require these libraries. (Doug Cutting 20050301) + + 6. Index host and title in separate fields. Host was indexed + previously only as a part of the URL. Title was indexed as an + anchor. Now boosts for matching these fields may be adjusted + separately from boosts for matching anchors and url. Also: move + site indexing to index-basic plugin to minimize the number of + times the URL needs to be parsed; and, stop using anchor analyzer + for anything but anchors. (Piotr Kosiorowski via Doug Cutting + 20050323) + + 7. Add servlet Cached.java that serves cached Content of any mime type. + Slightly modified are web.xml and cached.jsp. + (John Xing, 20050401) + + 8. Add skipCompressedByteArray() to WritableUtils.java. + (John Xing, 20050402) + + 9. Fixes to jsp and static web pages. These now use relative links, + so that the Nutch webapp file can be used in places other than at + the root. Also fixed links to the about and help pages. Bug #32. + (Jerome Charron via cutting, 20050404) + +10. Added some features to DistributedSearch: new segments can be added + to searchservers without restarting the frontend, defective search + servers are not queried until tey come back online, watchdog keeps + an eye for your searchservers and writes simple statistics. + (Sami Siren, 20050407) + +11. 
Fix for bug #4 - Unbalanced quote in query eats all resources. + (Piotr Kosiorowski, Sami Siren, 20050407) + +12. Close Issue #33 - MIME content type detector (using magic char sequences). + (Jerome Charron and Hari Kodungallur via John Xing, 20050416) + +13. Add a servlet that implements A9's OpenSearch RSS web service. + (cutting, 20050418) + +14. Remove references to link analysis from tutorial, and enable + scoring by link count when generating fetchlists and searching. + (cutting, 20040419) + +15. Make query boosts for host, title, anchor and phrase matches + configurable. (Piotr Kosiorowski via cutting, 20050419) + +16. Add support for sorting search results and search-time deduping by + fields other than site. + +17. Automatically convert range queries into cached range filters. + This improves the performance and scalability of, e.g., date range + searching. + +18. Several methods have been renamed due to misspellings. The old + methods have been deprecated and will be removed before the 1.0 + release. + + +Release 0.6 + + 1. Added clustering-carrot2 plugin, together with introduction of clustering + api and modification to search jsp. (Dawid Weiss via John Xing, 20040809) + + 2. Make a number of changes to NDFS (Nutch Distributed File System) + to fix bugs, add admin tools, etc. + + Also, modify all command line tools so you can indicate whether to + use NDFS or the local filesystem. If you indicate nothing, then + it defaults to the local fs. + + I've used this to do a 35m page crawl via NDFS, distributed over a + dozen machines. (Mike Cafarella) + + 3. Add support for BASE tags in HTML. Outlinks are now correctly + extracted when a BASE tag is present. (cutting) + + 4. Fix two bugs in result pagination. When the last hit on a page + was the last hit overall, the "next" button was sometimes shown + when the "show all" button should be shown instead. 
Also, in + certain cases, the "show all" button would be shown when the + "next" button should have been shown. (cutting) + + 5. Add config parameter "indexer.max.tokens" that determines the + maximum number of tokens indexed per field. (Andy Hedges via cutting) + + 6. Add parser for mp3 files. (Andy Hedges via cutting) + + 7. Add RegexUrlNormalizer. This is useful for things like stripping + out session IDs from URLs. To use it, add values for + urlnormalizer.class and urlnormalizer.regex.file to your + nutch-site.xml. The RegexUrlNormalizer class extends the + BasicUrlNormalizer, and does basic normalization as well. + (Luke Baker via cutting) + + 8. Added Swedish translation (Stefan Verzel via Sami Siren, 20040910) + + 9. Added Polish translation (Andrzej Bialecki, 20040911) + +10. Added 3 more language profiles to language identifier (ru,hu,pl). + Other changes to language identifier: Profiles converted to utf8, + added some test cases, changed the similarity calculation. + (Sami Siren, 20040925) + +11. Added plugin parse-rtf (Andy Hedges via John Xing, 20040929) + +12. Added plugin index-more and more.jsp (John Xing, 20041003) + +13. Added "View as Plain Text" feature. A new op OP_PARSETEXT is introduced + in DistributedSearch.java. text.jsp is added. (John Xing, 20041006) + +14. Fixed a bug that fails cached.jsp, explain.jsp, anchors.jsp and text.jsp + (but not search.jsp) with NullPointerException in distributed search. + It seems that this bug appears after "hits per site" stuff is added. + The fix is done in Hit.java, making sure String site is never null. + Hopefully this fix does not have a bad effect on "hits per site" code. + (John Xing, 20041006) + +15. Fixed a bug that fails fullyDelete() in FileUtil.java for + LocalFileSystem.java. This bug also exposes possible incompleteness + of NDFSFile.java, where a few methods are not supported, including + delete(). Nothing changed in NDFSFile.java though. Leave it for future + improvement (John Xing, 20041022). + +16.
Introduced option -noParsing to Fetcher.java and added ParseSegment.java. + A new status code CANT_PARSE is added to FetcherOutput.java. + Without option -noParsing, there is no change in fetcher behavior. With + option -noParsing, the fetcher only crawls; no parsing is carried out. + Then, ParseSegment.java should be used to parse in a separate pass. + (John Xing, 20041025) + +17. Added ontology plugin. Currently it is used for query refinement, as + exemplified in refine-query-init.jsp and refine-query.jsp. By default, + query refinement is disabled in search.jsp. Please check + ./src/plugin/ontology/README.txt for further description. + Ontology plugin certainly can be used for many other things. + (Michael J. Pan via John Xing, 20041129) + +18. Changed fetcher.server.delay to be a float, so that sub-second + delays can be specified. (cutting) + +19. Added plugin.includes config parameter that determines which + plugins are included. By default now only http, html and basic + indexing and search plugins are enabled, rather than all plugins. + This should make default performance more predictable and reliable + going forward. (cutting) + +20. Cleaned up some filesystem code, including: + + - Replaced BufferedRandomAccessFile with two simpler utilities, + NFSDataInputStream and NFSDataOutputStream. + + - Fixed the bug where SequenceFiles were no longer flushed when + created, so that, when fetches crashed, segments were + unreadable. Now segments are always readable after crashes. + Only the contents of the last buffer are lost. + + - Simplified the FSOutputStream API to not include seek(). We + should never need that functionality. + + - Simplified LocalFileSystem's implementations of FSInputStream + and FSOutputStream and optimized FSInputStream.seek(). + + (cutting) + +21. Fixed BasicUrlNormalizer to better handle relative urls. The file + part of a URL is normalized in the following manner: + + 1.
"/aa/../" will be replaced by "/" This is done step by step until + the url doesn´t change anymore. So we ensure, that + "/aa/bb/../../" will be replaced by "/", too + + 2. leading "/../" will be replaced by "/" + + (Sven Wende via cutting) + +22. Fix Page constructors so that next fetch date is less likely to be + misconstrued as a float. This patches a problem in WebDBInjector, + where new pages were added to the db with nextScore set to the + intended nextFetch date. This, in turn, confused link analysis. + +23. In ndfs code, replace addLocalFile(), putToLocalFile() with + copyFromLocalFile(), moveFromLocalFile(), copyToLocalFile() and + moveToLocalFile(). (John Xing, 20041217) + +24. Added new config parameter fetcher.threads.per.host. This is used + by the Http protocol. When this is one behavior is as before. + When this is greater than one then multiple threads are permitted + to access a host at once. Note that fetcher.server.delay is no + longer consistently observed when this is greater than one. + (Luke Baker via Doug Cutting) + +Release 0.5 + + 1. Changed plugin directory to be a list of directories. + + 2. Permit Plugin to be the default plugin implementation. + + 3. Added pluggable interface for network protocols in new package + net.nutch.protocol. Moved http code from core into a plugin. + + 4. Added pluggable interface for content parsing in new package + net.nutch.parse. Moved html parsing code from core into a + plugin. + + 5. Fixed a bug in NutchAnalysis where 16-bit characters were not + processed correctly. + + 6. Fixed bug #971731: random summaries on result page. + (Daniel Naber via cutting) + + 7. Made Nutch logo transparent. (Daniel Naber via cutting) + + 8. Added file protocol plugin. (John Xing via cutting) + + 9. Added ftp protocol plugin. (John Xing via cutting) + +10. Added pdf and msword parser plugins. (John Xing via cutting) + +11. Added pluggable indexing interface. 
By default, url, content, + anchors and title are indexed, as before, but now one can easily + alter this to, e.g., index metadata. A demonstration is provided + which extracts and indexes Creative Commons license urls. (cutting) + +12. Add language identification plugin. + + The process of identification is as follows: + + 1. html (html only, HTML 4.0 "lang" attribute) + 2. meta tags (html only, http-equiv, dc.language) + 3. http header (Content-Language) + 4. if all above fail "statistical analysis" + + 1 & 2 are run during the fetching phase and 3 & 4 are run during the + indexing phase. + + Currently supported languages (in "statistical analysis") are + da,de,el,en,es,fi,fr,it,nl,sv and pt. The corpus used was grabbed + from http://www.isi.edu/~koehn/europarl/ and the profiles were + built with the tool supplied in the patch. + + After indexing, the language can be found in the field named "lang". + + It's not 100% accurate but it's a start. + (Sami Siren) + +13. Added SegmentMergeTool and "mergesegs" command, to remove + duplicated or otherwise not used content from several segments and + join them together into a single new segment. The tool also + optionally performs several other steps required for proper + operation of Nutch - such as indexing segments, deleting + duplicates, merging indices, and indexing the new single segment. + (Andrzej Bialecki) + +14. Add the ability to retrieve ParseData of a search hit. ParseData + contains many valuable properties of a search hit. + + This is required (among others) to properly display the cached + content because it's not possible to determine the character + encoding from the output of the getContent() method (which returns + byte[]). The symptoms are that for HTML pages using non-latin1 or + non-UTF8 encodings the cached preview will almost certainly look + broken.
Using the attached patch it is possible to determine the + character encoding from the ParseData (for HTTP: Content-Type + metadata), and encode the content accordingly. (Andrzej Bialecki) + +15. Add a pluggable query interface. By default, the content, anchor + and url fields are searched as before. A sample plugin indexes + the host name and adds a "site:" keyword to query parsing. + +16. Added support for "lang:" in queries. For example, searching with + "lang:en" restricts results to pages which were identified to + be in English. + +17. Automatically optimize field queries to use cached Lucene filters. + This makes, for example, searches restricted by languages or sites + that are very common much faster. + +18. Improved charset handling in jsp pages. (jshin by cutting) + +19. Permit topic filtering when injecting DMOZ pages. (jshin by cutting) + +20. When parsing crawled pages, interpret charset specifications in + html meta tags. (jshin by cutting) + +21. Added support for "cc:licensed" in queries, which searches for documents + released under Creative Commons licenses. Attributes of the + license may also be queried, with, e.g., "cc:by" for + attribution-required licenses, "cc:nc" for non-commercial + licenses, etc. + +22. Relative paths named in plugin.folders are now searched for on the + classpath. This makes, e.g., deployment in a war file much simpler. + +23. Modifications to Fetcher.java. + + 1. Make sure it works properly with regard to creation and initialization + of plugin instances. The problem was that multiple threads race to + startUp() or shutDown() plugin instances. It was solved by synchronizing + certain code in PluginRepository.java and Extension.java. + (Stefan Groschupf via John Xing) + + 2. Added code to explicitly shutDown() plugins. Otherwise FetcherThreads + may never return (quit) if there are still data or other structures + (e.g., persistent socket connections) associated with plugins. (John Xing) + + 3.
Fixed one type of Fetcher "hang" problems by monitoring named + FetcherThreads. If all FetcherThreads are gone (finished), + Fetcher.java is considered done. The problem was: there could be + runaway threads started by external libs via FetcherThreads. + Those threads never return, thus keep Fetcher from exiting normally. + (John Xing) + +24. Eliminate excessive hits from sites. This is done efficiently by + adding the site name to Hit instances, and, when needed, + re-querying with too-frequent sites prohibited in the query. + + +Release 0.4 + + 1. Http class refactored. (Kevin Smith via Tom Pierce) + + 2. Add Finnish translation. (Sampo Syreeni via Doug Cutting) + + 3. Added Japanese translation. (Yukio Andoh via Doug Cutting) + + 4. Updated Dutch translation. (Ype Kingma via Doug Cutting) + + 5. Initial version of Distributed DB code. (Mike Cafarella) + + 6. Make things more tolerant of crashed fetcher output files. + (Doug Cutting) + + 7. New skin for website. (Frank Henze via Doug Cutting) + + 8. Added Spanish translation. (Diego Basch via Doug Cutting) + + 9. Add FTP support to fetcher. (John Xing via Doug Cutting) + +10. Added Thai translation. (Pichai Ongvasith via Doug Cutting) + +11. Added Robots.txt & throttling support to Fetcher.java. (Mike + Cafarella) + +12. Added nightly build. (Doug Cutting) + +13. Default all link scores to 1.0. (Doug Cutting) + +14. Permit one to keep internal links. (Doug Cutting) + +15. Fixed dedup to select shortest URL. (Doug Cutting) + +16. Changed index merger so that merged index is written to named + directory, rather than to a generated name in that directory. + (Doug Cutting) + +17. Disable coordination weighting of query clauses and other minor + scoring improvements. (Doug Cutting) + +18. Added a new command, crawl, that constructs a database, injects a + url file and performs a few rounds of generate/fetch/updatedb. + This simplifies use for intranet sites. Changed some defaults to + be more intranet friendly. 
(Doug Cutting) + +19. Fixed a bug where Fetcher.java didn't construct correct relative + links when a page was redirected. (Doug Cutting) + +20. Fixed a query parser problem with lookahead over plusses and minuses. + (Doug Cutting) + +21. Add support for HTTP proxy servers. (Sami Siren via Doug Cutting) + +22. Permit searching while fetching and/or indexing. + (Sami Siren via Doug Cutting) + +23. Fix a bug when throttling is disabled. (Sami Siren via Doug Cutting) + +24. Updated Bahasa Malaysia translation. (Michael Lim via Doug Cutting) + +25. Added Catalan translation. (Xavier Guardiola via Doug Cutting) + +26. Added brazilian portuguese translation. + (A. Moreir via Doug Cutting) + +27. Added a french translation. (Julien Nioche via Doug Cutting) + +28. Updated to Lucene 1.4RC3. (Doug Cutting) + +29. Add capability to boost by link count & use it in crawl tool. + (Doug Cutting) + +30. Added plugin system. (Stefan Groschupf via Doug Cutting) + +31. Add this change log file, for recording significant changes to + Nutch. Populate it with changes from the last few months. diff --git a/apache-nutch-2.3/LICENSE.txt b/apache-nutch-2.3/LICENSE.txt new file mode 100644 index 0000000..1b7a967 --- /dev/null +++ b/apache-nutch-2.3/LICENSE.txt @@ -0,0 +1,5793 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +APACHE NUTCH SUBCOMPONENTS + +lib/commons-httpclient-3.0.1.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + +lib/icu4j-4_0_1.jar + +ICU license - ICU 1.8.1 and later + + COPYRIGHT AND PERMISSION NOTICE + + Copyright (c) 1995-2006 International Business Machines Corporation and + others + + All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, provided that the above copyright notice(s) and this + permission notice appear in all copies of the Software and that both the + above copyright notice(s) and this permission notice appear in supporting + documentation. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY + RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS + NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR + PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF + THIS SOFTWARE. + + Except as contained in this notice, the name of a copyright holder shall + not be used in advertising or otherwise to promote the sale, use or other + dealings in this Software without prior written authorization of the + copyright holder. + + ---------------------------------------------------------------------- + + All trademarks and registered trademarks mentioned herein are the property + of their respective owners. + +lib/commons-collections-3.2.1.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +lib/commons-logging-1.0.4.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +lib/lucene-misc-2.4.0.jar +src/plugin/summary-lucene/lib/lucene-highlighter-2.4.0.jar +lib/lucene-core-2.4.0.jar +src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.4.0.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. 
Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +lib/jakarta-oro-2.0.8.jar + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2000-2002 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" + * must not be used to endorse or promote products derived from this + * software without prior written permission. For written + * permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache" + * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their + * name, without prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +lib/jetty-ext/commons-el.jar + +/* + * $Header: /home/cvs/jakarta-commons/el/LICENSE.txt,v 1.1.1.1 2003/02/04 00:22:24 luehe Exp $ + * $Revision: 1.1.1.1 $ + * $Date: 2003/02/04 00:22:24 $ + * + * ==================================================================== + * + * The Apache Software License, Version 1.1 + * + * Copyright (c) 1999-2002 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3.
The end-user documentation included with the redistribution, if + * any, must include the following acknowlegement: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowlegement may appear in the software itself, + * if and wherever such third-party acknowlegements normally appear. + * + * 4. The names "The Jakarta Project", "Commons", and "Apache Software + * Foundation" must not be used to endorse or promote products derived + * from this software without prior written permission. For written + * permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache" + * nor may "Apache" appear in their names without prior written + * permission of the Apache Group. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>.
+ * + */ + +lib/jetty-ext/ant.jar + +/* + * Apache License + * Version 2.0, January 2004 + * http://www.apache.org/licenses/ + * + * TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + * + * 1. Definitions. + * + * "License" shall mean the terms and conditions for use, reproduction, + * and distribution as defined by Sections 1 through 9 of this document. + * + * "Licensor" shall mean the copyright owner or entity authorized by + * the copyright owner that is granting the License. + * + * "Legal Entity" shall mean the union of the acting entity and all + * other entities that control, are controlled by, or are under common + * control with that entity. For the purposes of this definition, + * "control" means (i) the power, direct or indirect, to cause the + * direction or management of such entity, whether by contract or + * otherwise, or (ii) ownership of fifty percent (50%) or more of the + * outstanding shares, or (iii) beneficial ownership of such entity. + * + * "You" (or "Your") shall mean an individual or Legal Entity + * exercising permissions granted by this License. + * + * "Source" form shall mean the preferred form for making modifications, + * including but not limited to software source code, documentation + * source, and configuration files. + * + * "Object" form shall mean any form resulting from mechanical + * transformation or translation of a Source form, including but + * not limited to compiled object code, generated documentation, + * and conversions to other media types. + * + * "Work" shall mean the work of authorship, whether in Source or + * Object form, made available under the License, as indicated by a + * copyright notice that is included in or attached to the work + * (an example is provided in the Appendix below). 
+ * + * "Derivative Works" shall mean any work, whether in Source or Object + * form, that is based on (or derived from) the Work and for which the + * editorial revisions, annotations, elaborations, or other modifications + * represent, as a whole, an original work of authorship. For the purposes + * of this License, Derivative Works shall not include works that remain + * separable from, or merely link (or bind by name) to the interfaces of, + * the Work and Derivative Works thereof. + * + * "Contribution" shall mean any work of authorship, including + * the original version of the Work and any modifications or additions + * to that Work or Derivative Works thereof, that is intentionally + * submitted to Licensor for inclusion in the Work by the copyright owner + * or by an individual or Legal Entity authorized to submit on behalf of + * the copyright owner. For the purposes of this definition, "submitted" + * means any form of electronic, verbal, or written communication sent + * to the Licensor or its representatives, including but not limited to + * communication on electronic mailing lists, source code control systems, + * and issue tracking systems that are managed by, or on behalf of, the + * Licensor for the purpose of discussing and improving the Work, but + * excluding communication that is conspicuously marked or otherwise + * designated in writing by the copyright owner as "Not a Contribution." + * + * "Contributor" shall mean Licensor and any individual or Legal Entity + * on behalf of whom a Contribution has been received by Licensor and + * subsequently incorporated within the Work. + * + * 2. Grant of Copyright License. 
Subject to the terms and conditions of + * this License, each Contributor hereby grants to You a perpetual, + * worldwide, non-exclusive, no-charge, royalty-free, irrevocable + * copyright license to reproduce, prepare Derivative Works of, + * publicly display, publicly perform, sublicense, and distribute the + * Work and such Derivative Works in Source or Object form. + * + * 3. Grant of Patent License. Subject to the terms and conditions of + * this License, each Contributor hereby grants to You a perpetual, + * worldwide, non-exclusive, no-charge, royalty-free, irrevocable + * (except as stated in this section) patent license to make, have made, + * use, offer to sell, sell, import, and otherwise transfer the Work, + * where such license applies only to those patent claims licensable + * by such Contributor that are necessarily infringed by their + * Contribution(s) alone or by combination of their Contribution(s) + * with the Work to which such Contribution(s) was submitted. If You + * institute patent litigation against any entity (including a + * cross-claim or counterclaim in a lawsuit) alleging that the Work + * or a Contribution incorporated within the Work constitutes direct + * or contributory patent infringement, then any patent licenses + * granted to You under this License for that Work shall terminate + * as of the date such litigation is filed. + * + * 4. Redistribution. 
You may reproduce and distribute copies of the + * Work or Derivative Works thereof in any medium, with or without + * modifications, and in Source or Object form, provided that You + * meet the following conditions: + * + * (a) You must give any other recipients of the Work or + * Derivative Works a copy of this License; and + * + * (b) You must cause any modified files to carry prominent notices + * stating that You changed the files; and + * + * (c) You must retain, in the Source form of any Derivative Works + * that You distribute, all copyright, patent, trademark, and + * attribution notices from the Source form of the Work, + * excluding those notices that do not pertain to any part of + * the Derivative Works; and + * + * (d) If the Work includes a "NOTICE" text file as part of its + * distribution, then any Derivative Works that You distribute must + * include a readable copy of the attribution notices contained + * within such NOTICE file, excluding those notices that do not + * pertain to any part of the Derivative Works, in at least one + * of the following places: within a NOTICE text file distributed + * as part of the Derivative Works; within the Source form or + * documentation, if provided along with the Derivative Works; or, + * within a display generated by the Derivative Works, if and + * wherever such third-party notices normally appear. The contents + * of the NOTICE file are for informational purposes only and + * do not modify the License. You may add Your own attribution + * notices within Derivative Works that You distribute, alongside + * or as an addendum to the NOTICE text from the Work, provided + * that such additional attribution notices cannot be construed + * as modifying the License. 
+ * + * You may add Your own copyright statement to Your modifications and + * may provide additional or different license terms and conditions + * for use, reproduction, or distribution of Your modifications, or + * for any such Derivative Works as a whole, provided Your use, + * reproduction, and distribution of the Work otherwise complies with + * the conditions stated in this License. + * + * 5. Submission of Contributions. Unless You explicitly state otherwise, + * any Contribution intentionally submitted for inclusion in the Work + * by You to the Licensor shall be under the terms and conditions of + * this License, without any additional terms or conditions. + * Notwithstanding the above, nothing herein shall supersede or modify + * the terms of any separate license agreement you may have executed + * with Licensor regarding such Contributions. + * + * 6. Trademarks. This License does not grant permission to use the trade + * names, trademarks, service marks, or product names of the Licensor, + * except as required for reasonable and customary use in describing the + * origin of the Work and reproducing the content of the NOTICE file. + * + * 7. Disclaimer of Warranty. Unless required by applicable law or + * agreed to in writing, Licensor provides the Work (and each + * Contributor provides its Contributions) on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied, including, without limitation, any warranties or conditions + * of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + * PARTICULAR PURPOSE. You are solely responsible for determining the + * appropriateness of using or redistributing the Work and assume any + * risks associated with Your exercise of permissions under this License. + * + * 8. Limitation of Liability. 
In no event and under no legal theory, + * whether in tort (including negligence), contract, or otherwise, + * unless required by applicable law (such as deliberate and grossly + * negligent acts) or agreed to in writing, shall any Contributor be + * liable to You for damages, including any direct, indirect, special, + * incidental, or consequential damages of any character arising as a + * result of this License or out of the use or inability to use the + * Work (including but not limited to damages for loss of goodwill, + * work stoppage, computer failure or malfunction, or any and all + * other commercial damages or losses), even if such Contributor + * has been advised of the possibility of such damages. + * + * 9. Accepting Warranty or Additional Liability. While redistributing + * the Work or Derivative Works thereof, You may choose to offer, + * and charge a fee for, acceptance of support, warranty, indemnity, + * or other liability obligations and/or rights consistent with this + * License. However, in accepting such obligations, You may act only + * on Your own behalf and on Your sole responsibility, not on behalf + * of any other Contributor, and only if You agree to indemnify, + * defend, and hold each Contributor harmless for any liability + * incurred by, or claims asserted against, such Contributor by reason + * of your accepting any such warranty or additional liability. + * + * END OF TERMS AND CONDITIONS + * + * APPENDIX: How to apply the Apache License to your work. + * + * To apply the Apache License to your work, attach the following + * boilerplate notice, with the fields enclosed by brackets "[]" + * replaced with your own identifying information. (Don't include + * the brackets!) The text should be enclosed in the appropriate + * comment syntax for the file format. 
We also recommend that a + * file or class name and description of purpose be included on the + * same "printed page" as the copyright notice for easier + * identification within third-party archives. + * + * Copyright [yyyy] [name of copyright owner] + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +lib/jetty-ext/jsp-api.jar +ASF + +lib/jetty-ext/jasper-runtime.jar +ASF + +lib/jetty-ext/jasper-compiler.jar +ASF + +lib/hadoop-0.19.1-core.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +APACHE HADOOP SUBCOMPONENTS: + +The Apache Hadoop project contains subcomponents with separate copyright +notices and license terms. Your use of the source code for the these +subcomponents is subject to the terms and conditions of the following +licenses. + +For the org.apache.hadoop.util.bloom.* classes: + +/** + * + * Copyright (c) 2005, European Commission project OneLab under contract + * 034819 (http://www.one-lab.org) + * All rights reserved. + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * - Neither the name of the University Catholique de Louvain - UCL + * nor the names of its contributors may be used to endorse or + * promote products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +lib/apache-solr-common-1.3.0.jar +lib/apache-solr-solrj-1.3.0.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +========================================================================== +Portions of Jetty 6 are bundled in the Solr example server. +Jetty 6 includes a binary javax.servlet package licensed under the +Common Development and Distribution License. +-------------------------------------------------------------------------- +COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0 + +1. Definitions. + +1.1. Contributor means each individual or entity that creates or contributes to +the creation of Modifications. + +1.2. Contributor Version means the combination of the Original Software, prior +Modifications used by a Contributor (if any), and the Modifications made by +that particular Contributor. + +1.3. Covered Software means (a) the Original Software, or (b) Modifications, or +(c) the combination of files containing Original Software with files containing +Modifications, in each case including portions thereof. + +1.4. Executable means the Covered Software in any form other than Source Code. + +1.5. Initial Developer means the individual or entity that first makes Original +Software available under this License. + +1.6. Larger Work means a work which combines Covered Software or portions +thereof with code not governed by the terms of this License. + +1.7. License means this document. + +1.8. 
Licensable means having the right to grant, to the maximum extent +possible, whether at the time of the initial grant or subsequently acquired, +any and all of the rights conveyed herein. + +1.9. Modifications means the Source Code and Executable form of any of the +following: + +A. Any file that results from an addition to, deletion from or modification of +the contents of a file containing Original Software or previous Modifications; + +B. Any new file that contains any part of the Original Software or previous +Modification; or + +C. Any new file that is contributed or otherwise made available under the terms +of this License. + +1.10. Original Software means the Source Code and Executable form of computer +software code that is originally released under this License. + +1.11. Patent Claims means any patent claim(s), now owned or hereafter acquired, +including without limitation, method, process, and apparatus claims, in any +patent Licensable by grantor. + +1.12. Source Code means (a) the common form of computer software code in which +modifications are made and (b) associated documentation included in or with +such code. + +1.13. You (or Your) means an individual or a legal entity exercising rights +under, and complying with all of the terms of, this License. For legal +entities, You includes any entity which controls, is controlled by, or is under +common control with You. For purposes of this definition, control means (a) the +power, direct or indirect, to cause the direction or management of such entity, +whether by contract or otherwise, or (b) ownership of more than fifty percent +(50%) of the outstanding shares or beneficial ownership of such entity. + +2. License Grants. + +2.1. The Initial Developer Grant.
Conditioned upon Your compliance with +Section 3.1 below and subject to third party intellectual property claims, the +Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive +license: (a) under intellectual property rights (other than patent or +trademark) Licensable by Initial Developer, to use, reproduce, modify, display, +perform, sublicense and distribute the Original Software (or portions thereof), +with or without Modifications, and/or as part of a Larger Work; and (b) under +Patent Claims infringed by the making, using or selling of Original Software, +to make, have made, use, practice, sell, and offer for sale, and/or otherwise +dispose of the Original Software (or portions thereof). (c) The licenses +granted in Sections 2.1(a) and (b) are effective on the date Initial Developer +first distributes or otherwise makes the Original Software available to a third +party under the terms of this License. (d) Notwithstanding Section 2.1(b) +above, no patent license is granted: (1) for code that You delete from the +Original Software, or (2) for infringements caused by: (i) the modification of +the Original Software, or (ii) the combination of the Original Software with +other software or devices. + +2.2. Contributor Grant.
Conditioned upon Your compliance with Section 3.1 +below and subject to third party intellectual property claims, each Contributor +hereby grants You a world-wide, royalty-free, non-exclusive license: (a) under +intellectual property rights (other than patent or trademark) Licensable by +Contributor to use, reproduce, modify, display, perform, sublicense and +distribute the Modifications created by such Contributor (or portions thereof), +either on an unmodified basis, with other Modifications, as Covered Software +and/or as part of a Larger Work; and (b) under Patent Claims infringed by the +making, using, or selling of Modifications made by that Contributor either +alone and/or in combination with its Contributor Version (or portions of such +combination), to make, use, sell, offer for sale, have made, and/or otherwise +dispose of: (1) Modifications made by that Contributor (or portions thereof); +and (2) the combination of Modifications made by that Contributor with its +Contributor Version (or portions of such combination). (c) The licenses +granted in Sections 2.2(a) and 2.2(b) are effective on the date Contributor +first distributes or otherwise makes the Modifications available to a third +party. (d) Notwithstanding Section 2.2(b) above, no patent license is granted: +(1) for any code that Contributor has deleted from the Contributor Version; +(2) for infringements caused by: (i) third party modifications of Contributor +Version, or (ii) the combination of Modifications made by that Contributor with +other software (except as part of the Contributor Version) or other devices; or +(3) under Patent Claims infringed by Covered Software in the absence of +Modifications made by that Contributor. + +3. Distribution Obligations. + +3.1. Availability of Source Code.
+ +Any Covered Software that You distribute or otherwise make available in +Executable form must also be made available in Source Code form and that Source +Code form must be distributed only under the terms of this License. You must +include a copy of this License with every copy of the Source Code form of the +Covered Software You distribute or otherwise make available. You must inform +recipients of any such Covered Software in Executable form as to how they can +obtain such Covered Software in Source Code form in a reasonable manner on or +through a medium customarily used for software exchange. + +3.2. Modifications. + +The Modifications that You create or to which You contribute are governed by +the terms of this License. You represent that You believe Your Modifications +are Your original creation(s) and/or You have sufficient rights to grant the +rights conveyed by this License. + +3.3. Required Notices. You must include a notice in each of Your Modifications +that identifies You as the Contributor of the Modification. You may not remove +or alter any copyright, patent or trademark notices contained within the +Covered Software, or any notices of licensing or any descriptive text giving +attribution to any Contributor or the Initial Developer. + +3.4. Application of Additional Terms. You may not offer or impose any terms on +any Covered Software in Source Code form that alters or restricts the +applicable version of this License or the recipients' rights hereunder. You may +choose to offer, and to charge a fee for, warranty, support, indemnity or +liability obligations to one or more recipients of Covered Software. However, +you may do so only on Your own behalf, and not on behalf of the Initial +Developer or any Contributor.
You must make it absolutely clear that any such +warranty, support, indemnity or liability obligation is offered by You alone, +and You hereby agree to indemnify the Initial Developer and every Contributor +for any liability incurred by the Initial Developer or such Contributor as a +result of warranty, support, indemnity or liability terms You offer. + +3.5. Distribution of Executable Versions. You may distribute the Executable +form of the Covered Software under the terms of this License or under the terms +of a license of Your choice, which may contain terms different from this +License, provided that You are in compliance with the terms of this License and +that the license for the Executable form does not attempt to limit or alter the +recipients' rights in the Source Code form from the rights set forth in this +License. If You distribute the Covered Software in Executable form under a +different license, You must make it absolutely clear that any terms which +differ from this License are offered by You alone, not by the Initial Developer +or Contributor. You hereby agree to indemnify the Initial Developer and every +Contributor for any liability incurred by the Initial Developer or such +Contributor as a result of any such terms You offer. + +3.6. Larger Works. You may create a Larger Work by combining Covered Software +with other code not governed by the terms of this License and distribute the +Larger Work as a single product. In such a case, You must make sure the +requirements of this License are fulfilled for the Covered Software. + +4. Versions of the License. + +4.1. New Versions. Sun Microsystems, Inc. is the initial license steward and +may publish revised and/or new versions of this License from time to time. Each +version will be given a distinguishing version number. Except as provided in +Section 4.3, no one other than the license steward has the right to modify this +License. + +4.2. Effect of New Versions.
+ +You may always continue to use, distribute or otherwise make the Covered +Software available under the terms of the version of the License under which +You originally received the Covered Software. If the Initial Developer includes +a notice in the Original Software prohibiting it from being distributed or +otherwise made available under any subsequent version of the License, You must +distribute and make the Covered Software available under the terms of the +version of the License under which You originally received the Covered +Software. Otherwise, You may also choose to use, distribute or otherwise make +the Covered Software available under the terms of any subsequent version of the +License published by the license steward. + +4.3. Modified Versions. + +When You are an Initial Developer and You want to create a new license for Your +Original Software, You may create and use a modified version of this License if +You: (a) rename the license and remove any references to the name of the +license steward (except to note that the license differs from this License); +and (b) otherwise make it clear that the license contains terms which differ +from this License. + +5. DISCLAIMER OF WARRANTY. + +COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, WITHOUT +WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT +LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE IS FREE OF DEFECTS, +MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK +AS TO THE QUALITY AND PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD +ANY COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL +DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, +REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART +OF THIS LICENSE. NO USE OF ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT +UNDER THIS DISCLAIMER. + +6. TERMINATION. + +6.1.
This License and the rights granted hereunder will terminate automatically +if You fail to comply with terms herein and fail to cure such breach within 30 +days of becoming aware of the breach. Provisions which, by their nature, must +remain in effect beyond the termination of this License shall survive. + +6.2. If You assert a patent infringement claim (excluding declaratory judgment +actions) against Initial Developer or a Contributor (the Initial Developer or +Contributor against whom You assert such claim is referred to as Participant) +alleging that the Participant Software (meaning the Contributor Version where +the Participant is a Contributor or the Original Software where the Participant +is the Initial Developer) directly or indirectly infringes any patent, then any +and all rights granted directly or indirectly to You by such Participant, the +Initial Developer (if the Initial Developer is not the Participant) and all +Contributors under Sections 2.1 and/or 2.2 of this License shall, upon 60 days +notice from Participant terminate prospectively and automatically at the +expiration of such 60 day notice period, unless if within such 60 day period +You withdraw Your claim with respect to the Participant Software against such +Participant either unilaterally or pursuant to a written agreement with +Participant. + +6.3. In the event of termination under Sections 6.1 or 6.2 above, all end user +licenses that have been validly granted by You or any distributor hereunder +prior to termination (excluding licenses granted to You by any distributor) +shall survive termination. + +7. LIMITATION OF LIABILITY.
+ +UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING +NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL DEVELOPER, ANY +OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED SOFTWARE, OR ANY SUPPLIER OF +ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT +LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER +FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN +IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS +LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL +INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW +PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR +LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND +LIMITATION MAY NOT APPLY TO YOU. + +8. U.S. GOVERNMENT END USERS. + +The Covered Software is a commercial item, as that term is defined in +48 C.F.R. 2.101 (Oct. 1995), consisting of commercial computer software (as +that term is defined at 48 C.F.R. § 252.227-7014(a)(1)) and commercial computer +software documentation as such terms are used in 48 C.F.R. 12.212 (Sept. 1995). +Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 +(June 1995), all U.S. Government End Users acquire Covered Software with only +those rights set forth herein. This U.S. Government Rights clause is in lieu +of, and supersedes, any other FAR, DFAR, or other clause or provision that +addresses Government rights in computer software under this License. + +9. MISCELLANEOUS. + +This License represents the complete agreement concerning subject matter +hereof. If any provision of this License is held to be unenforceable, such +provision shall be reformed only to the extent necessary to make it +enforceable.
This License shall be governed by the law of the jurisdiction +specified in a notice contained within the Original Software (except to the +extent applicable law, if any, provides otherwise), excluding such +jurisdiction's conflict-of-law provisions. Any litigation relating to this +License shall be subject to the jurisdiction of the courts located in the +jurisdiction and venue specified in a notice contained within the Original +Software, with the losing party responsible for costs, including, without +limitation, court costs and reasonable attorneys' fees and expenses. The +application of the United Nations Convention on Contracts for the International +Sale of Goods is expressly excluded. Any law or regulation which provides that +the language of a contract shall be construed against the drafter shall not +apply to this License. You agree that You alone are responsible for compliance +with the United States export administration regulations (and the export +control laws and regulation of any other countries) when You use, distribute or +otherwise make available any Covered Software. + +10. RESPONSIBILITY FOR CLAIMS. + +As between Initial Developer and the Contributors, each party is responsible +for claims and damages arising, directly or indirectly, out of its utilization +of rights under this License and You agree to work with Initial Developer and +Contributors to distribute such responsibility on an equitable basis. Nothing +herein is intended or shall be deemed to constitute any admission of liability. + +NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE +(CDDL) The GlassFish code released under the CDDL shall be governed by the laws +of the State of California (excluding conflict-of-law provisions).
Any +litigation relating to this License shall be subject to the jurisdiction of the +Federal Courts of the Northern District of California and the state courts of +the State of California, with venue lying in Santa Clara County, California. + + +========================================================================== +The following license applies to parts of the lucene-snowball jar +that are generated from the snowball sources at http://snowball.tartarus.org/ +-------------------------------------------------------------------------- +The BSD License + +Copyright (c) 2001, Dr Martin Porter, Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +========================================================================== +The following license applies to easymock.jar +-------------------------------------------------------------------------- +EasyMock 2 License (MIT License) +Copyright (c) 2001-2007 OFFIS, Tammo Freese. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +========================================================================== +The following license applies to the JQuery JavaScript library +-------------------------------------------------------------------------- +Copyright (c) 2008 John Resig, http://jquery.com/ + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +========================================================================== +The following license applies to stax-utils.jar +-------------------------------------------------------------------------- +Copyright (c) 2004, Christian Niles, unit12.net +Copyright (c) 2004, Sun Microsystems, Inc. +Copyright (c) 2006, John Kristian +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the listed copyright holders nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +========================================================================== +The following license applies to JUnit +-------------------------------------------------------------------------- +Common Public License - v 1.0 + +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. 
DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial code and documentation distributed under this Agreement, and + b) in the case of each subsequent Contributor: + + i) changes to the Program, and + + ii) additions to the Program; + + where such changes and/or additions to the Program originate from and are distributed by that particular Contributor. A Contribution 'originates' from a Contributor if it was added to the Program by such Contributor itself or anyone acting on such Contributor's behalf. Contributions do not include additions to the Program which: (i) are separate modules of software distributed in conjunction with the Program under their own license agreement, and (ii) are not derivative works of the Program. + +"Contributor" means any person or entity that distributes the Program. + +"Licensed Patents " mean patent claims licensable by a Contributor which are necessarily infringed by the use or sale of its Contribution alone or when combined with the Program. + +"Program" means the Contributions distributed in accordance with this Agreement. + +"Recipient" means anyone who receives the Program under this Agreement, including all Contributors. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, distribute and sublicense the Contribution of such Contributor, if any, and such derivative works, in source code and object code form. + + b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in source code and object code form. 
This patent license shall apply to the combination of the Contribution and the Program if, at the time the Contribution is added by the Contributor, such addition of the Contribution causes such combination to be covered by the Licensed Patents. The patent license shall not apply to any other combinations which include the Contribution. No hardware per se is licensed hereunder. + + c) Recipient understands that although each Contributor grants the licenses to its Contributions set forth herein, no assurances are provided by any Contributor that the Program does not infringe the patent or other intellectual property rights of any other entity. Each Contributor disclaims any liability to Recipient for claims brought by any other entity based on infringement of intellectual property rights or otherwise. As a condition to exercising the rights and licenses granted hereunder, each Recipient hereby assumes sole responsibility to secure any other intellectual property rights needed, if any. For example, if a third party patent license is required to allow Recipient to distribute the Program, it is Recipient's responsibility to acquire that license before distributing the Program. + + d) Each Contributor represents that to its knowledge it has sufficient copyright rights in its Contribution, if any, to grant the copyright license set forth in this Agreement. + +3. 
REQUIREMENTS + +A Contributor may choose to distribute the Program in object code form under its own license agreement, provided that: + + a) it complies with the terms and conditions of this Agreement; and + + b) its license agreement: + + i) effectively disclaims on behalf of all Contributors all warranties and conditions, express and implied, including warranties or conditions of title and non-infringement, and implied warranties or conditions of merchantability and fitness for a particular purpose; + + ii) effectively excludes on behalf of all Contributors all liability for damages, including direct, indirect, special, incidental and consequential damages, such as lost profits; + + iii) states that any provisions which differ from this Agreement are offered by that Contributor alone and not by any other party; and + + iv) states that source code for the Program is available from such Contributor, and informs licensees how to obtain it in a reasonable manner on or through a medium customarily used for software exchange. + +When the Program is made available in source code form: + + a) it must be made available under this Agreement; and + + b) a copy of this Agreement must be included with each copy of the Program. + +Contributors may not remove or alter any copyright notices contained within the Program. + +Each Contributor must identify itself as the originator of its Contribution, if any, in a manner that reasonably allows subsequent Recipients to identify the originator of the Contribution. + +4. COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities with respect to end users, business partners and the like. While this license is intended to facilitate the commercial use of the Program, the Contributor who includes the Program in a commercial product offering should do so in a manner which does not create potential liability for other Contributors. 
Therefore, if a Contributor includes the Program in a commercial product offering, such Contributor ("Commercial Contributor") hereby agrees to defend and indemnify every other Contributor ("Indemnified Contributor") against any losses, damages and costs (collectively "Losses") arising from claims, lawsuits and other legal actions brought by a third party against the Indemnified Contributor to the extent caused by the acts or omissions of such Commercial Contributor in connection with its distribution of the Program in a commercial product offering. The obligations in this section do not apply to any claims or Losses relating to any actual or alleged intellectual property infringement. In order to qualify, an Indemnified Contributor must: a) promptly notify the Commercial Contributor in writing of such claim, and b) allow the Commercial Contributor to control, and cooperate with the Commercial Contributor in, the defense and any related settlement negotiations. The Indemnified Contributor may participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial product offering, Product X. That Contributor is then a Commercial Contributor. If that Commercial Contributor then makes performance claims, or offers warranties related to Product X, those performance claims and warranties are such Commercial Contributor's responsibility alone. Under this section, the Commercial Contributor would have to defend claims against the other Contributors related to those performance claims and warranties, and if a court requires any other Contributor to pay any damages as a result, the Commercial Contributor must pay those damages. + +5. 
NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the appropriateness of using and distributing the Program and assumes all risks associated with its exercise of rights under this Agreement, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. GENERAL + +If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against a Contributor with respect to a patent applicable to software (including a cross-claim or counterclaim in a lawsuit), then any patent licenses granted by that Contributor to such Recipient under this Agreement shall terminate as of the date such litigation is filed. 
In addition, if Recipient institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. No one other than the Agreement Steward has the right to modify this Agreement. IBM is the initial Agreement Steward. IBM may assign the responsibility to serve as the Agreement Steward to a suitable separate entity. Each new version of the Agreement will be given a distinguishing version number. The Program (including Contributions) may always be distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to distribute the Program (including its Contributions) under the new version. 
Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no rights or licenses to the intellectual property of any Contributor under this Agreement, whether expressly, by implication, estoppel or otherwise. All rights in the Program not expressly granted under this Agreement are reserved. + +This Agreement is governed by the laws of the State of New York and the intellectual property laws of the United States of America. No party to this Agreement will bring a legal action under this Agreement more than one year after the cause of action arose. Each party waives its rights to a jury trial in any resulting litigation. + + +lib/xerces-2_6_2-apis.jar + +xml-commons/java/external/LICENSE.sax.txt $Id: LICENSE.sax.txt,v 1.1 2002/01/31 23:26:48 curcuru Exp $ + + +This license came from: http://www.megginson.com/SAX/copying.html + However please note future versions of SAX may be covered + under http://saxproject.org/?selected=pd + + +This page is now out of date -- see the new SAX site at +http://www.saxproject.org/ for more up-to-date +releases and other information. Please change your bookmarks. + + +SAX2 is Free! + +I hereby abandon any property rights to SAX 2.0 (the Simple API for +XML), and release all of the SAX 2.0 source code, compiled code, and +documentation contained in this distribution into the Public Domain. +SAX comes with NO WARRANTY or guarantee of fitness for any +purpose. + +David Megginson, david@megginson.com +2000-05-05 + + +xml-commons/java/external/LICENSE.dom-software.txt $Id: LICENSE.dom-software.txt,v 1.1 2002/01/31 23:13:42 curcuru Exp $ + + +This license came from: http://www.w3.org/Consortium/Legal/copyright-software-19980720 + + +W3C® SOFTWARE NOTICE AND LICENSE +Copyright © 1994-2001 World +Wide Web Consortium, World +Wide Web Consortium, (Massachusetts Institute of +Technology, Institut National de +Recherche en Informatique et en Automatique, Keio University). All Rights Reserved. 
+http://www.w3.org/Consortium/Legal/ + +This W3C work (including software, documents, or other related +items) is being provided by the copyright holders under the +following license. By obtaining, using and/or copying this work, +you (the licensee) agree that you have read, understood, and will +comply with the following terms and conditions: +Permission to use, copy, modify, and distribute this software +and its documentation, with or without modification,  for any +purpose and without fee or royalty is hereby granted, provided that +you include the following on ALL copies of the software and +documentation or portions thereof, including modifications, that +you make: + +The full text of this NOTICE in a location viewable to users of +the redistributed or derivative work. + +Any pre-existing intellectual property disclaimers, notices, or +terms and conditions. If none exist, a short notice of the +following form (hypertext is preferred, text is permitted) should +be used within the body of any redistributed or derivative code: +"Copyright © [$date-of-software] World Wide Web Consortium, (Massachusetts Institute of +Technology, Institut National de +Recherche en Informatique et en Automatique, Keio University). All Rights Reserved. +http://www.w3.org/Consortium/Legal/" + +Notice of any changes or modifications to the W3C files, +including the date changes were made. (We recommend you provide +URIs to the location from which the code is derived.) + +THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND +COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF +MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE +USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD +PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. +COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, +SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE +SOFTWARE OR DOCUMENTATION. 
+ +The name and trademarks of copyright holders may NOT be used in +advertising or publicity pertaining to the software without +specific, written prior permission. Title to copyright in this +software and any associated documentation will at all times remain +with copyright holders. +____________________________________ +This formulation of W3C's notice and license became active on +August 14 1998 so as to improve compatibility with GPL. This +version ensures that W3C software licensing terms are no more +restrictive than GPL and consequently W3C software may be +distributed in GPL packages. See the older formulation for the +policy prior to this date. Please see our Copyright FAQ for common +questions about using materials from +our site, including specific terms and conditions for packages like +libwww, Amaya, and Jigsaw. +Other questions about this notice can be +directed to site-policy@w3.org. + +webmaster +(last updated $Date: 2002/01/31 23:13:42 $) + +lib/commons-cli-2.0-SNAPSHOT.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +lib/commons-codec-1.3.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +lib/xerces-2_6_2.jar +ASF + +lib/jetty-5.1.4.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +lib/commons-lang-2.1.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +lib/commons-beanutils-1.8.0.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +lib/commons-logging-api-1.0.4.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +lib/servlet-api.jar +ASF + +lib/jets3t-0.6.1.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +lib/tika-0.1-incubating.jar + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +lib/junit-3.8.1.jar + +Common Public License Version 1.0 + +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial code and documentation distributed under this Agreement, and + + b) in the case of each subsequent Contributor: + + i) changes to the Program, and + + ii) additions to the Program; + + where such changes and/or additions to the Program originate from and are distributed by that particular Contributor. A Contribution 'originates' from a Contributor if it was added to the Program by such Contributor itself or anyone acting on such Contributor's behalf. Contributions do not include additions to the Program which: (i) are separate modules of software distributed in conjunction with the Program under their own license agreement, and (ii) are not derivative works of the Program. + +"Contributor" means any person or entity that distributes the Program. + +"Licensed Patents " mean patent claims licensable by a Contributor which are necessarily infringed by the use or sale of its Contribution alone or when combined with the Program. + +"Program" means the Contributions distributed in accordance with this Agreement. 
+ +"Recipient" means anyone who receives the Program under this Agreement, including all Contributors. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, distribute and sublicense the Contribution of such Contributor, if any, and such derivative works, in source code and object code form. + + b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in source code and object code form. This patent license shall apply to the combination of the Contribution and the Program if, at the time the Contribution is added by the Contributor, such addition of the Contribution causes such combination to be covered by the Licensed Patents. The patent license shall not apply to any other combinations which include the Contribution. No hardware per se is licensed hereunder. + + c) Recipient understands that although each Contributor grants the licenses to its Contributions set forth herein, no assurances are provided by any Contributor that the Program does not infringe the patent or other intellectual property rights of any other entity. Each Contributor disclaims any liability to Recipient for claims brought by any other entity based on infringement of intellectual property rights or otherwise. As a condition to exercising the rights and licenses granted hereunder, each Recipient hereby assumes sole responsibility to secure any other intellectual property rights needed, if any. For example, if a third party patent license is required to allow Recipient to distribute the Program, it is Recipient's responsibility to acquire that license before distributing the Program. 
+ + d) Each Contributor represents that to its knowledge it has sufficient copyright rights in its Contribution, if any, to grant the copyright license set forth in this Agreement. + +3. REQUIREMENTS + +A Contributor may choose to distribute the Program in object code form under its own license agreement, provided that: + + a) it complies with the terms and conditions of this Agreement; and + + b) its license agreement: + + i) effectively disclaims on behalf of all Contributors all warranties and conditions, express and implied, including warranties or conditions of title and non-infringement, and implied warranties or conditions of merchantability and fitness for a particular purpose; + + ii) effectively excludes on behalf of all Contributors all liability for damages, including direct, indirect, special, incidental and consequential damages, such as lost profits; + + iii) states that any provisions which differ from this Agreement are offered by that Contributor alone and not by any other party; and + + iv) states that source code for the Program is available from such Contributor, and informs licensees how to obtain it in a reasonable manner on or through a medium customarily used for software exchange. + +When the Program is made available in source code form: + + a) it must be made available under this Agreement; and + + b) a copy of this Agreement must be included with each copy of the Program. + +Contributors may not remove or alter any copyright notices contained within the Program. + +Each Contributor must identify itself as the originator of its Contribution, if any, in a manner that reasonably allows subsequent Recipients to identify the originator of the Contribution. + +4. COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities with respect to end users, business partners and the like.
While this license is intended to facilitate the commercial use of the Program, the Contributor who includes the Program in a commercial product offering should do so in a manner which does not create potential liability for other Contributors. Therefore, if a Contributor includes the Program in a commercial product offering, such Contributor ("Commercial Contributor") hereby agrees to defend and indemnify every other Contributor ("Indemnified Contributor") against any losses, damages and costs (collectively "Losses") arising from claims, lawsuits and other legal actions brought by a third party against the Indemnified Contributor to the extent caused by the acts or omissions of such Commercial Contributor in connection with its distribution of the Program in a commercial product offering. The obligations in this section do not apply to any claims or Losses relating to any actual or alleged intellectual property infringement. In order to qualify, an Indemnified Contributor must: a) promptly notify the Commercial Contributor in writing of such claim, and b) allow the Commercial Contributor to control, and cooperate with the Commercial Contributor in, the defense and any related settlement negotiations. The Indemnified Contributor may participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial product offering, Product X. That Contributor is then a Commercial Contributor. If that Commercial Contributor then makes performance claims, or offers warranties related to Product X, those performance claims and warranties are such Commercial Contributor's responsibility alone. Under this section, the Commercial Contributor would have to defend claims against the other Contributors related to those performance claims and warranties, and if a court requires any other Contributor to pay any damages as a result, the Commercial Contributor must pay those damages. + +5. 
NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the appropriateness of using and distributing the Program and assumes all risks associated with its exercise of rights under this Agreement, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. GENERAL + +If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against a Contributor with respect to a patent applicable to software (including a cross-claim or counterclaim in a lawsuit), then any patent licenses granted by that Contributor to such Recipient under this Agreement shall terminate as of the date such litigation is filed.
In addition, if Recipient institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. No one other than the Agreement Steward has the right to modify this Agreement. IBM is the initial Agreement Steward. IBM may assign the responsibility to serve as the Agreement Steward to a suitable separate entity. Each new version of the Agreement will be given a distinguishing version number. The Program (including Contributions) may always be distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to distribute the Program (including its Contributions) under the new version. 
Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no rights or licenses to the intellectual property of any Contributor under this Agreement, whether expressly, by implication, estoppel or otherwise. All rights in the Program not expressly granted under this Agreement are reserved. + +This Agreement is governed by the laws of the State of New York and the intellectual property laws of the United States of America. No party to this Agreement will bring a legal action under this Agreement more than one year after the cause of action arose. Each party waives its rights to a jury trial in any resulting litigation. +OFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. GENERAL + +If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against a Contributor with respect to a patent applicable to software (including a cross-claim or counterclaim in a lawsuit), then any patent licenses granted by that Contributor to such Recipient under this Agreement shall terminate as of the date such litigation is filed. 
In addition, if Recipient institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. No one other than the Agreement Steward has the right to modify this Agreement. IBM is the initial Agreement Steward. IBM may assign the responsibility to serve as the Agreement Steward to a suitable separate entity. Each new version of the Agreement will be given a distinguishing version number. The Program (including Contributions) may always be distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to distribute the Program (including its Contributions) under the new version. 
Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no rights or licenses to the intellectual property of any Contributor under this Agreement, whether expressly, by implication, estoppel or otherwise. All rights in the Program not expressly granted under this Agreement are reserved. + +This Agreement is governed by the laws of the State of New York and the intellectual property laws of the United States of America. No party to this Agreement will bring a legal action under this Agreement more than one year after the cause of action arose. Each party waives its rights to a jury trial in any resulting litigation. + +lib/taglibs-i18n.jar +ASF + +lib/log4j-1.2.15.jar +ASF + +src/plugin/feed/lib/rome-0.9.jar +Copyright 2004 Sun Microsystems, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + + +Office Open XML (OOXML) xsds: +----------------------------- + +These were downloaded as part of the Office Open XML ECMA Specification +from http://www.ecma-international.org/publications/standards/Ecma-376.htm + +These are included within the Apache POI distribution, and are available +under compatible licensing terms. + +Copyright - ECMA International, "made available without restriction" + http://www.ecma-international.org/memento/Ecmabylaws.htm - section 9.4 +Patent License - Microsoft Open Specification Promise (OSP) + http://www.microsoft.com/interop/osp/ + + +src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + + +Office Open XML (OOXML) xsds: +----------------------------- + +These were downloaded as part of the Office Open XML ECMA Specification +from http://www.ecma-international.org/publications/standards/Ecma-376.htm + +These are included within the Apache POI distribution, and are available +under compatible licensing terms. + +Copyright - ECMA International, "made available without restriction" + http://www.ecma-international.org/memento/Ecmabylaws.htm - section 9.4 +Patent License - Microsoft Open Specification Promise (OSP) + http://www.microsoft.com/interop/osp/ + +src/plugin/urlfilter-automaton/lib/automaton.jar + +dk.brics.automaton +------------------ + +Copyright (C) 2001-2004 Anders Moeller + +This source code in this package may be used under the terms of the +BSD license. Please read the file 'COPYING' for details. + +This package contains a full DFA/NFA implementation with Unicode +alphabet and support for all standard regular expression operations. + +For more information, go to the package home page at +http://www.brics.dk/~amoeller/automaton/ + + +Anders Moeller +amoeller@brics.dk + +src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar + +The CyberNeko Software License, Version 1.0 + + +(C) Copyright 2002,2003, Andy Clark. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +3. The end-user documentation included with the redistribution, + if any, must include the following acknowledgment: + "This product includes software developed by Andy Clark." + Alternately, this acknowledgment may appear in the software itself, + if and wherever such third-party acknowledgments normally appear. + +4. The names "CyberNeko" and "NekoHTML" must not be used to endorse + or promote products derived from this software without prior + written permission. For written permission, please contact + andy@cyberneko.net. + +5. Products derived from this software may not be called "CyberNeko", + nor may "CyberNeko" appear in their name, without prior written + permission of the author. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR OTHER CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +==================================================================== + +This license is based on the Apache Software License, version 1.1. 
+ + +src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar + +Copyright (c) Michael Schmeling 1998, 2000 - All Rights Reserved + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, provided that the above copyright notice(s) and this +permission notice appear in all copies of the Software and that both the +above copyright notice(s) and this permission notice appear in supporting +documentation. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE +BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, +OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in this Software without prior written authorization of the +copyright holder. + +src/plugin/clustering-carrot2/lib/carrot2-util-common.jar +src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar +src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar +src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar +src/plugin/clustering-carrot2/lib/carrot2-local-core.jar + + +Carrot2 Project + +Copyright (C) 2002-2006, Dawid Weiss, Stanisław Osiński. 
+Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +- Neither the name of the Poznan University of Technology, Poznan, Poland nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +- We request that you include in the end-user documentation provided with the + redistribution and/or in the software itself an acknowledgement equivalent to + the following: "This product includes software developed by the Carrot2 + Project." + +- No algorithms or technical solutions in the project may be patented or claimed + proprietary. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +src/plugin/clustering-carrot2/lib/commons-collections-3.2.jar +ASF + +src/plugin/clustering-carrot2/lib/Jama-1.0.2.jar + +Copyright Notice This software is a cooperative product of The MathWorks and the +National Institute of Standards and Technology (NIST) which has been released to +the public domain. Neither The MathWorks nor NIST assumes any responsibility +whatsoever for its use by other parties, and makes no guarantees, expressed or +implied, about its quality, reliability, or any other characteristic. + +src/plugin/clustering-carrot2/lib/commons-pool-1.3.jar +ASF + +src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar +ASF + +src/plugin/ontology/lib/jena-2.1.jar + +/* + * (c) Copyright 2000, 2001, 2002, 2003, 2004 Hewlett-Packard Development Company, LP + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +This product includes software developed by the +Apache Software Foundation (http://www.apache.org/). + +src/plugin/parse-swf/lib/javaswf.jar + + + Copyright (c) 2001-2005, David N. Main, All rights reserved. + + Redistribution and use in source and binary forms, with or + without modification, are permitted provided that the + following conditions are met: + + 1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. The name of the author may not be used to endorse or + promote products derived from this software without specific + prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +src/plugin/response-json/lib/json-lib-2.2.2-jdk15.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +src/plugin/response-json/lib/ezmorph-1.0.6.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +src/plugin/parse-html/lib/tagsoup-1.2.jar + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +src/plugin/parse-pdf/lib/PDFBox-0.7.4-dev.jar + +Copyright (c) 2003-2005, www.pdfbox.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of pdfbox; nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +src/plugin/parse-pdf/lib/JempBox-0.2.0.jar + +Copyright (c) 2006-2007, www.jempbox.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of pdfbox; nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +src/plugin/parse-pdf/lib/FontBox-0.2.0-dev.jar + +Copyright (c) 2003-2005, www.fontbox.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of fontbox; nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +src/plugin/parse-pdf/lib/bcprov-jdk14-132.jar + +Copyright (c) 2000 - 2008 The Legion Of The Bouncy Castle (http://www.bouncycastle.org) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +src/plugin/lib-xml/lib/jdom.jar + +/*-- + + $Id: LICENSE.txt,v 1.11 2004/02/06 09:32:57 jhunter Exp $ + + Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin. + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer that follows + these conditions in the documentation and/or other materials + provided with the distribution. + + 3. The name "JDOM" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact <request_AT_jdom_DOT_org>. + + 4. Products derived from this software may not be called "JDOM", nor + may "JDOM" appear in their name, without prior written permission + from the JDOM Project Management <request_AT_jdom_DOT_org>. + + In addition, we request (but do not require) that you include in the + end-user documentation provided with the redistribution and/or in the + software itself an acknowledgement equivalent to the following: + "This product includes software developed by the + JDOM Project (http://www.jdom.org/)." + Alternatively, the acknowledgment may be graphical using the logos + available at http://www.jdom.org/images/logos. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED.
IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the JDOM Project and was originally + created by Jason Hunter <jhunter_AT_jdom_DOT_org> and + Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information + on the JDOM Project, please see <http://www.jdom.org/>. + + */ + +src/plugin/lib-xml/lib/jaxen-jdom.jar +src/plugin/lib-xml/lib/jaxen-core.jar + +/* + $Id: LICENSE.txt 1128 2006-02-05 21:49:04Z elharo $ + + Copyright 2003-2006 The Werken Company. All Rights Reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the Jaxen Project nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + */ + + +src/plugin/lib-xml/lib/saxpath.jar + +/*-- + + $Id: LICENSE,v 1.1 2002/04/26 17:43:56 jstrachan Exp $ + + Copyright (C) 2000-2002 werken digital. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer that follows + these conditions in the documentation and/or other materials + provided with the distribution. + + 3. The name "SAXPath" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact license@saxpath.org. + + 4. Products derived from this software may not be called "SAXPath", nor + may "SAXPath" appear in their name, without prior written permission + from the SAXPath Project Management (pm@saxpath.org). + + In addition, we request (but do not require) that you include in the + end-user documentation provided with the redistribution and/or in the + software itself an acknowledgement equivalent to the following: + "This product includes software developed by the + SAXPath Project (http://www.saxpath.org/)." 
+ Alternatively, the acknowledgment may be graphical using the logos + available at http://www.saxpath.org/ + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE SAXPath AUTHORS OR THE PROJECT + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the SAXPath Project and was originally + created by bob mcwhirter and + James Strachan . For more information on the + SAXPath Project, please see . + + */ + +src/plugin/lib-xml/lib/xercesImpl.jar + +ASF + + + diff --git a/apache-nutch-2.3/NOTICE.txt b/apache-nutch-2.3/NOTICE.txt new file mode 100644 index 0000000..4b119e5 --- /dev/null +++ b/apache-nutch-2.3/NOTICE.txt @@ -0,0 +1,15 @@ +Apache Nutch +Copyright 2015 The Apache Software Foundation + +This product includes software developed by The Apache Software +Foundation (http://www.apache.org/). + +This product includes software developed by the following copyright owners: + +Nutch includes JavaSWF: +Copyright (c) 2001-2005, David N. Main, All rights reserved. + +Nutch includes Automaton: +This package is Copyright © 2001-2008 Anders Møller. All rights reserved. 
+ + diff --git a/apache-nutch-2.3/README.txt b/apache-nutch-2.3/README.txt new file mode 100644 index 0000000..c80bcfb --- /dev/null +++ b/apache-nutch-2.3/README.txt @@ -0,0 +1,36 @@ +Apache Nutch README + +For the latest information about Nutch, please visit our website at: + + http://nutch.apache.org + +and our wiki, at: + + http://wiki.apache.org/nutch/ + +To get started using Nutch read Tutorial: + + http://wiki.apache.org/nutch/Nutch2Tutorial + +Export Control + +This distribution includes cryptographic software. The country in which you +currently reside may have restrictions on the import, possession, use, and/or +re-export to another country, of encryption software. BEFORE using any encryption +software, please check your country's laws, regulations and policies concerning the +import, possession, or use, and re-export of encryption software, to see if this is +permitted. See for more information. + +The U.S. Government Department of Commerce, Bureau of Industry and Security (BIS), has +classified this software as Export Commodity Control Number (ECCN) 5D002.C.1, which +includes information security software using or performing cryptographic functions with +asymmetric algorithms. The form and manner of this Apache Software Foundation +distribution makes it eligible for export under the License Exception ENC Technology +Software Unrestricted (TSU) exception (see the BIS Export Administration Regulations, +Section 740.13) for both object code and source code. + +The following provides more details on the included cryptographic software: + +Apache Nutch uses the PDFBox API in its parse-tika plugin for extracting textual content +and metadata from encrypted PDF files. See http://pdfbox.apache.org for more +details on PDFBox. 
diff --git a/apache-nutch-2.3/build.xml b/apache-nutch-2.3/build.xml new file mode 100644 index 0000000..12a575c --- /dev/null +++ b/apache-nutch-2.3/build.xml @@ -0,0 +1,1043 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tests failed! + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + You need Apache Ivy 2.0 or later from http://ant.apache.org/ + It could not be loaded from ${ivy.repo.url} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rel="tag" · Microformats Wiki + + + + + +
+ + + + + +
+ +
+ + + +

rel="tag"

+ +

+ + +
Jump to: navigation, + search +
+ +
+

Contents

[hide]
+ +
+

Draft Specification 2005-01-10

+
Editors/Authors +
Tantek Çelik +
Kevin Marks +
+
Concept +
Derek Powazek +
+
short URL +
http://tr.im/reltag +
+

Copyright

+

This specification is (C) 2004-2012 by the authors. However, the +authors intend to submit this specification to a standards body with a +liberal copyright/licensing policy such as the GMPG, IETF, and/or W3C. Anyone wishing to contribute should read their copyright principles, policies and licenses (e.g. the GMPG Principles) and agree to them, including licensing of all contributions under all required licenses (e.g. CC-by 1.0 and later), before contributing. +

+
  • Tantek: + I release all my contributions to this specification into the public +domain and I encourage the other authors to do so as well. +
  • Kevin Marks: + I release all my contributions to this specification into the public +domain and I encourage the other authors to do so as well. +
    • When all authors/editors have done so, we can remove the +MicroFormatCopyrightStatement template reference and replace it with the + MicroFormatPublicDomainContributionStatement. +
    +
+

Patents

+

This specification is subject to a royalty free patent policy, e.g. per the W3C Patent Policy, and IETF RFC3667 & RFC3668. +

+

Abstract

+

Rel-Tag is one of several MicroFormats. By adding rel="tag" + to a hyperlink, a page indicates that the destination of that hyperlink + is an author-designated "tag" (or keyword/subject) for the current +page. Note that a tag may just refer to a major portion of the current +page (i.e. a blog post). e.g. by placing this link on a page, +

+
<a href="http://technorati.com/tag/tech" rel="tag">tech</a>
+

the author indicates that the page (or some portion of the page) has the tag "tech". +

The linked page SHOULD exist, and it is the linked page, rather +than the link text, that defines the tag. The last path component of the + URL +is the text of the tag, so +

+
<a href="http://technorati.com/tag/tech" rel="tag">fish</a>
+

would indicate the tag "tech" rather than "fish". +

+

Scope

+

rel="tag" is specifically designed for "tagging" content, typically web pages (or portions thereof, like blog posts). +

rel="tag" is NOT designed for "tagging" arbitrary URLs or +external content. There is demand for a general decentralized syntax +for tagging URLs external to the current page, but this is not meant for + that. See xFolk and hReview for ways to tag arbitrary URLs. +

If you need to define tags as part of a more specialized format, rel="tag" is the recommended way to do so, and xFolk, hReview, hCard, hCalendar and hRecipe all do this. +

+

XMDP profile

+

See rel-tag-profile. +

+

Tag Spaces

+Tags are embedded in HTTP URIs in a well-defined manner so that the tag +embedded in an HTTP URI can be mechanically extracted from that URI. +Specifically, the last segment of the path portion of the URI (after the + final "/" character) contains the tag value. For example, the URI
http://www.example.com/tags/foo
contains the tag "foo". +

Thus, for the purposes of comparing two HTTP URIs as tags, the last +segment of the path portion should be extracted and only that value +(that value of the tag) should be compared. +

Need more formal language about comparison and extraction process. +

The destination of a rel="tag" hyperlink is required to be a tag +space (a place that collates or defines tags), where the last segment of + the path of the URL is the tag, e.g. +

+
http://technorati.com/tag/tech
+

is a URL for the tag "tech". +

Tags may only be placed in the URL path, and only in the last +segment of the path. Tags may not be placed in query parameters or +fragment identifiers. e.g. +

+
http://technorati.com/tag/tech?tag=fish#emu
+

is still a URL for the tag "tech", not "fish" or "emu". +

Since the only part of a tag space URL of which any structure is +required is the last path segment, a tag space URL can be hosted at any +domain. Authors may choose to link to a tag at a particular tag space +in order to provide a specific meaning. E.g. a tag for technology could + link to: +

+
http://en.wikipedia.org/wiki/Technology
+

Trailing slashes in tag URLs are ignored, that is: +

+
http://technorati.com/tag/Technology/
+

as a rel-tag URL is treated as: +

+
http://technorati.com/tag/Technology
+

Encoding issues

+

Spaces can be encoded either as + or %20. Unicode characters are encoded as specified in RFC 3986. For example: +

+
<a href="http://technorati.com/tag/Sant%C3%A9+et+bien-%C3%AAtre" rel="tag">Santé et bien-être</a>
+

Note that if using Wikipedia as a tagspace, as discussed above, you should use %20 as they remap '+' to %2B, causing a page with a plus sign in the title (which usually does not exist) to appear. +

+

Tags Are Visible Metadata

+

rel="tag" hyperlinks are intended to be visible links on + pages and posts. This is in stark contrast to meta keywords (which +were invisible and typically never revealed to readers), and thus is at +least somewhat more resilient to the problems which plagued meta +keywords. +

Making tag hyperlinks visible has the additional benefit of +making it more obvious to readers if a page is abusing tag links, and +thus providing more peer pressure for better behavior. It also makes it + more obvious to authors, who may not always be aware what invisible +metadata is being generated on their behalf. +

As a result the invisible tag link syntax variant: <link rel="tag" href="..." /> SHOULD NOT be supported by implementations. +

+

Examples in the wild

+

This section is informative. The number of rel-tag examples in + the wild has expanded far beyond the capacity of being kept inline in +this specification. They have been moved to a separate page. +

See rel-tag Examples in the wild. +

+

Implementations

+

This section is informative. +

The following implementations have been developed which either +generate or parse rel-tag links. If you have a rel-tag implementation, +please add it to the top of this list. Once the list grows too big, we'll make a separate wiki page like rel-tag-implementations. +

+
  • Degoli + is a simple experimental semantic web search engine which is crawling +rel-tag data. It orders search results by rel-tag data and displays a +navigation tree containing relationships among tags. +
  • b2evolution A blog platform which generates rel-tag for all tags on blog posts (starting from v3.x). +
  • Textcube.org makes a blogging tool, Textcube which generates rel-tag for tags on blog posts. +
  • HashTags tags Twitter posts, converting "#example" into a tag of "example" +
  • ikiwiki generates rel-tag for all tagged pages (as of version 2.6) +
  • HubTag helps users find a unique tag across the web for their event - eg. JohnAndBettysWeddingWimbledonMarch07 +
  • Necctar + is a search engine exclusively based on tag microformat parsing and +processing. Necctar uses tags entered by bloggers to index the world +wide web +
  • Nutch has a rel-tag parser committed to their svn repository. +
  • Dreamweaver Extension suite from the Web Standards Project enables rel-tagging from within Dreamweaver 8. +
  • Scooch + slide show creator allows authors to generate rel-tags and group slide +shows by rel-tag via a list or cloud with tag usage count. +
  • The Freetag plugin for the Serendipity + blog software supports rel-tags on a per-entry basis, as well as inside + of its tag clouds. (The Freetag plugin is available inside of +SPARTACUS) +
  • pnh_mf is a plugin for Textpattern that supports embedding rel-tags and other microformats in templates and blog posts. Written by Chris Casciano. +
  • tru_tags is a plugin for Textpattern that supports rel-tagging blog posts via the Keywords field. +
  • ClothesOnline uses rel-tag for categorizing shops and brands, for example: Canada Goose. +
  • LiveJournal - see also their FAQ regarding their tags support +
  • TagsLinks Turn each tag into links that let you find related content on tagging services. +
  • OctoFinder uses rel-tag for all live news tag clouds. +
  • Tag plugin for WordPress +
    • Note that some sites using WordPress (http://microformatique.com/ for instance) are getting incorrect tags. The tag is ?cat=12 instead of the actual tag value. +
    +
  • Tag plugin for Blosxom +
  • Technorati first implemented rel-tag in its Technorati Tags service. Technorati indexes rel-tag tags. +
  • Greasemonkey script for Firefox that generates tags for Blogger +
  • rel-lint is a validation tool by Drew McLellan that will validate existence of rel-tag attributes. +
+

articles

+

This section is informative. +

Articles about rel-tag, most recent first. When this section gets too big, we can move it to rel-tag-articles. +

+ +

References

+

Normative References

+ +

Informative References

+ +

Discussions

+ +

Q&A

+
  • If you have any questions about rel-tag, check the rel FAQ first for general rel attribute questions, then check the rel-tag FAQ, and then if you don't find answers, ask your question on the microformats-discuss mailing list. +
+

Related pages

+ +

The rel-tag specification is a work in progress. As additional +aspects are discussed, understood, and written, they will be added. +These thoughts, issues, and questions are kept in separate pages. +

+ + + + +
+ +

Categories

+ + +
+ rel="tag" + was last modified: + + + Monday, March 5th, 2012 +
+ +
+

Views

+ +
+ +
+ + + + + + + + + + + + \ No newline at end of file diff --git a/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java new file mode 100644 index 0000000..fa44ea8 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + +// Nutch imports +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.HashSet; + +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.storage.WebPage.Field; +import org.apache.nutch.util.Bytes; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that adds tag + * field(s) to the document. 
+ * + * @see + * http://www.microformats.org/wiki/rel-tag + * @author Jérôme Charron + */ +public class RelTagIndexingFilter implements IndexingFilter { + + private Configuration conf; + + private static final Collection FIELDS = new HashSet(); + + static { + FIELDS.add(WebPage.Field.BASE_URL); + FIELDS.add(WebPage.Field.METADATA); + } + + /** + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. + */ + @Override + public Collection getFields() { + return FIELDS; + } + + /** + * Set the {@link Configuration} object + */ + public void setConf(Configuration conf) { + this.conf = conf; + } + + /** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + + /** + * The {@link RelTagIndexingFilter} filter object. + * + * @param doc + * The {@link NutchDocument} object + * @param url + * URL to be filtered for rel-tag's + * @param page + * {@link WebPage} object relative to the URL + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, String url, WebPage page) + throws IndexingException { + // Check if some Rel-Tags found, possibly put there by RelTagParser + ByteBuffer bb = page.getMetadata().get(new Utf8(RelTagParser.REL_TAG)); + + if (bb != null) { + String[] tags = Bytes.toString(bb).split("\t"); + for (int i = 0; i < tags.length; i++) { + doc.add("tag", tags[i]); + } + } + return doc; + } +} diff --git a/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java new file mode 100644 index 0000000..f71c5ab --- /dev/null +++ b/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java @@ -0,0 
+1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + +// JDK imports +import java.net.URL; +import java.net.URLDecoder; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.ParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.storage.WebPage.Field; +import org.apache.nutch.util.StringUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +/** + * Adds microformat rel-tags of document if found. 
+ * + * @see + * http://www.microformats.org/wiki/rel-tag + * @author Jérôme Charron + */ +public class RelTagParser implements ParseFilter { + + public static final Logger LOG = LoggerFactory.getLogger(RelTagParser.class); + + public final static String REL_TAG = "Rel-Tag"; + + private Configuration conf = null; + + private static class Parser { + + Set tags = null; + + Parser(Node node) { + tags = new TreeSet(); + parse(node); + } + + Set getRelTags() { + return tags; + } + + void parse(Node node) { + if (node.getNodeType() == Node.ELEMENT_NODE) { + // Look for tag + if ("a".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + // Checks that it contains a href attribute + if (hrefNode != null) { + Node relNode = attrs.getNamedItem("rel"); + // Checks that it contains a rel attribute too + if (relNode != null) { + // Finaly checks that rel=tag + if ("tag".equalsIgnoreCase(relNode.getNodeValue())) { + String tag = parseTag(hrefNode.getNodeValue()); + if (!StringUtil.isEmpty(tag)) { + if (!tags.contains(tag)) { + tags.add(tag); + LOG.debug("Adding tag: " + tag + " to tag set."); + } + } + } + } + } + } + } + + // Recurse + NodeList children = node.getChildNodes(); + for (int i = 0; children != null && i < children.getLength(); i++) { + parse(children.item(i)); + } + } + + private final static String parseTag(String url) { + String tag = null; + try { + URL u = new URL(url); + String path = u.getPath(); + tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), + "UTF-8"); + } catch (Exception e) { + // Malformed tag... 
+ tag = null; + } + return tag; + } + } + + /** + * Set the {@link Configuration} object + */ + public void setConf(Configuration conf) { + this.conf = conf; + } + + /** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + + private static final Collection FIELDS = new HashSet(); + + static { + FIELDS.add(WebPage.Field.BASE_URL); + FIELDS.add(WebPage.Field.METADATA); + } + + /** + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. + */ + @Override + public Collection getFields() { + return FIELDS; + } + + @Override + /** + * Scan the HTML document looking at possible rel-tags + * @param url URL of the {@link WebPage} to be parsed + * @param page {@link WebPage} object relative to the URL + * @param parse {@link Parse} object holding parse status + * @param metatags within the {@link NutchDocument} + * @param doc The {@link NutchDocument} object + * @return parse the actual {@link Parse} object + */ + public Parse filter(String url, WebPage page, Parse parse, + HTMLMetaTags metaTags, DocumentFragment doc) { + // Trying to find the document's rel-tags + Parser parser = new Parser(doc); + Set tags = parser.getRelTags(); + // can't store multiple values in page metadata -> separate by tabs + StringBuffer sb = new StringBuffer(); + Iterator iter = tags.iterator(); + while (iter.hasNext()) { + sb.append(iter.next()); + sb.append("\t"); + } + ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes()); + page.getMetadata().put(new Utf8(REL_TAG), bb); + return parse; + } +} diff --git a/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html b/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html new file mode 100644 index 0000000..bef5409 --- /dev/null 
+++ b/apache-nutch-2.3/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html @@ -0,0 +1,8 @@ + + +

+A microformats Rel-Tag +Parser/Indexer/Querier plugin. +

+ + diff --git a/apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java b/apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java new file mode 100644 index 0000000..c21c4ad --- /dev/null +++ b/apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; + +import java.nio.ByteBuffer; + +import static org.junit.Assert.*; + +/** + * JUnit test case for {@link RelTagIndexingFilter} which simply asserts that a + * 'tag' field is obtained by the filter. 
+ * + * @author lewismc + */ + +public class TestRelTagIndexingFilter { + + @Test + public void testRelTagFields() throws Exception { + Configuration conf = NutchConfiguration.create(); + RelTagIndexingFilter filter = new RelTagIndexingFilter(); + filter.setConf(conf); + assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + WebPage page = WebPage.newBuilder().build(); + byte[] bytes = new byte[10]; + ByteBuffer bbuf = ByteBuffer.wrap(bytes); + page.getMetadata().put(new Utf8(RelTagParser.REL_TAG), bbuf); + try { + filter.filter(doc, "http://nutch.apache.org/", page); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag")); + } +} diff --git a/apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java b/apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java new file mode 100644 index 0000000..064b46b --- /dev/null +++ b/apache-nutch-2.3/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.util.MimeUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; + +import static org.junit.Assert.*; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +import static org.junit.Assert.assertEquals; + +/** + * Junit test for {@link RelTagParser} based mainly John Xing's parser tests. We + * are not concerned with actual parse text within the sample file, instead we + * assert that the rel-tags we expect are found in the WebPage metadata. To + * check the parser is working as expected we unwrap the ByteBuffer obtained + * from metadata, the same type as we use in expected (String). So just the + * other way around as we wrapped the metadata value. + * + * @author lewismc + * + */ +public class TestRelTagParser { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/microformats-reltag/build.xml during plugin compilation. 
+ private String sampleFile = "microformats_reltag_test.html"; + + // rel-tag's we expect to be extracted from page.getMetadata() + private String expectedRelTags = "Category:Specifications Category:rel-tag "; + + private Configuration conf; + + @Test + public void testRelTagParser() throws ParseException, ProtocolException, + IOException { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + @SuppressWarnings("unused") + Parse parse; + String urlString = "file:" + sampleDir + fileSeparator + sampleFile; + + File file = new File(sampleDir + fileSeparator + sampleFile); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + in.close(); + + WebPage page = WebPage.newBuilder().build(); + page.setBaseUrl(new Utf8(urlString)); + page.setContent(ByteBuffer.wrap(bytes)); + MimeUtil mimeutil = new MimeUtil(conf); + String mtype = mimeutil.getMimeType(file); + page.setContentType(new Utf8(mtype)); + parse = new ParseUtil(conf).parse(urlString, page); + // begin assertion for tests + ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag")); + byte[] byteArray = new byte[bbuf.remaining()]; + bbuf.get(byteArray); + String s = new String(byteArray); + // bbuf.flip(); + assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", + expectedRelTags, s); + } + +} \ No newline at end of file diff --git a/apache-nutch-2.3/src/plugin/nutch-extensionpoints/build.xml b/apache-nutch-2.3/src/plugin/nutch-extensionpoints/build.xml new file mode 100644 index 0000000..45eb815 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/nutch-extensionpoints/build.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + diff --git a/apache-nutch-2.3/src/plugin/nutch-extensionpoints/ivy.xml b/apache-nutch-2.3/src/plugin/nutch-extensionpoints/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/nutch-extensionpoints/ivy.xml @@ -0,0 +1,41 
#!/bin/bash
#
# Sample external command for the parse-ext plugin.
#
# Takes the MIME type as its single argument and filters standard input to
# standard output with the tool registered for that type.
#
# 20040701, John Xing

set -e

# Exactly one argument (the MIME type) is required.
if (( $# != 1 )); then
  echo Usage:$0 mimeType >&2
  exit 1
fi

if [ "$1" = "application/vnd.nutch.example.cat" ]; then
  # pass the content through unchanged
  cat
elif [ "$1" = "application/vnd.nutch.example.md5sum" ]; then
  # emit the MD5 checksum of the content
  md5sum
else
  echo "Can't parse mimeType $1" >&2
  exit 1
fi
b/apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java new file mode 100644 index 0000000..4264f6f --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java @@ -0,0 +1,177 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.ext; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; + +import org.apache.nutch.util.CommandRunner; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Hashtable; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.nio.charset.Charset; + +/** + * A wrapper that invokes external command to do real parsing job. 
+ * + * @author John Xing + */ + +public class ExtParser implements Parser { + + public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.ext"); + + static final int BUFFER_SIZE = 4096; + + static final int TIMEOUT_DEFAULT = 30; // in seconds + + // handy map from String contentType to String[] {command, timeoutString, encoding} + Hashtable TYPE_PARAMS_MAP = new Hashtable(); + + private Configuration conf; + + private boolean loaded = false; + + public ExtParser () { } + + public ParseResult getParse(Content content) { + + String contentType = content.getContentType(); + + String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType); + if (params == null) + return new ParseStatus(ParseStatus.FAILED, + "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf()); + + String command = params[0]; + int timeout = Integer.parseInt(params[1]); + String encoding = params[2]; + + if (LOG.isTraceEnabled()) { + LOG.trace("Use "+command+ " with timeout="+timeout+"secs"); + } + + String text = null; + String title = null; + + try { + + byte[] raw = content.getContent(); + + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, + "Content truncated at " + raw.length + +" bytes. 
Parser can't handle incomplete " + + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf()); + } + + ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); + ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4); + + CommandRunner cr = new CommandRunner(); + + cr.setCommand(command+ " " +contentType); + cr.setInputStream(new ByteArrayInputStream(raw)); + cr.setStdOutputStream(os); + cr.setStdErrorStream(es); + + cr.setTimeout(timeout); + + cr.evaluate(); + + if (cr.getExitValue() != 0) + return new ParseStatus(ParseStatus.FAILED, + "External command " + command + + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf()); + + text = os.toString(encoding); + + } catch (Exception e) { // run time exception + return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); + } + + if (text == null) + text = ""; + + if (title == null) + title = ""; + + // collect outlink + Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); + + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, content.getMetadata()); + return ParseResult.createParseResult(content.getUrl(), + new ParseImpl(text, parseData)); + } + + public void setConf(Configuration conf) { + this.conf = conf; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + "org.apache.nutch.parse.Parser").getExtensions(); + + String contentType, command, timeoutString, encoding; + + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + + // only look for extensions defined by plugin parse-ext + if (!extension.getDescriptor().getPluginId().equals("parse-ext")) + continue; + + contentType = extension.getAttribute("contentType"); + if (contentType == null || contentType.equals("")) + continue; + + command = extension.getAttribute("command"); + if (command == null || command.equals("")) + continue; + + // null encoding means default + encoding = 
extension.getAttribute("encoding"); + if (encoding == null) + encoding = Charset.defaultCharset().name(); + + timeoutString = extension.getAttribute("timeout"); + if (timeoutString == null || timeoutString.equals("")) + timeoutString = "" + TIMEOUT_DEFAULT; + + TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, encoding }); + } + } + + public Configuration getConf() { + return this.conf; + } +} diff --git a/apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java b/apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java new file mode 100644 index 0000000..04cf2d2 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse wrapper to run external command to do the parsing. 
+ */ +package org.apache.nutch.parse.ext; diff --git a/apache-nutch-2.3/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java b/apache-nutch-2.3/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java new file mode 100644 index 0000000..7bfc377 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.ext; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.hadoop.io.Text; + +import org.junit.Test; +import static org.junit.Assert.*; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +/** + * Unit tests for ExtParser. 
+ * First creates a temp file with fixed content, then fetch + * and parse it using external command 'cat' and 'md5sum' alternately + * for 10 times. Doing so also does a light stress test for class + * CommandRunner.java (as used in ExtParser.java). + * + * Warning: currently only do test on linux platform. + * + * @author John Xing + */ +public class TestExtParser { + private File tempFile = null; + private String urlString = null; + private Content content = null; + private Parse parse = null; + + private String expectedText = "nutch rocks nutch rocks nutch rocks"; + // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum + private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526"; + + public TestExtParser(String name) { + } + + protected void setUp() throws ProtocolException, IOException { + // prepare a temp file with expectedText as its content + // This system property is defined in ./src/plugin/build-plugin.xml + String path = System.getProperty("test.data"); + if (path != null) { + File tempDir = new File(path); + if (!tempDir.exists()) + tempDir.mkdir(); + tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir); + } else { + // otherwise in java.io.tmpdir + tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt"); + } + urlString = tempFile.toURL().toString(); + + FileOutputStream fos = new FileOutputStream(tempFile); + fos.write(expectedText.getBytes()); + fos.close(); + + // get nutch content + Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + protocol = null; + } + + protected void tearDown() { + // clean content + content = null; + + // clean temp file + //if (tempFile != null && tempFile.exists()) + // tempFile.delete(); + } + + @Test + public void testIt() throws ParseException { + String contentType; + + // now test only on linux platform + if 
(!System.getProperty("os.name").equalsIgnoreCase("linux")) { + System.err.println("Current OS is "+System.getProperty("os.name")+"."); + System.err.println("No test is run on OS other than linux."); + return; + } + + Configuration conf = NutchConfiguration.create(); + // loop alternately, total 10*2 times of invoking external command + for (int i=0; i<10; i++) { + // check external parser that does 'cat' + contentType = "application/vnd.nutch.example.cat"; + content.setContentType(contentType); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); + assertEquals(expectedText,parse.getText()); + + // check external parser that does 'md5sum' + contentType = "application/vnd.nutch.example.md5sum"; + content.setContentType(contentType); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); + assertTrue(parse.getText().startsWith(expectedMD5sum)); + } + } + +} diff --git a/apache-nutch-2.3/src/plugin/parse-html/build.xml b/apache-nutch-2.3/src/plugin/parse-html/build.xml new file mode 100644 index 0000000..a5b99b5 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/build.xml @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apache-nutch-2.3/src/plugin/parse-html/ivy.xml b/apache-nutch-2.3/src/plugin/parse-html/ivy.xml new file mode 100644 index 0000000..9443fdb --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/ivy.xml @@ -0,0 +1,42 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + + diff --git a/apache-nutch-2.3/src/plugin/parse-html/lib/tagsoup.LICENSE.txt b/apache-nutch-2.3/src/plugin/parse-html/lib/tagsoup.LICENSE.txt new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/lib/tagsoup.LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/apache-nutch-2.3/src/plugin/parse-html/plugin.xml b/apache-nutch-2.3/src/plugin/parse-html/plugin.xml new file mode 100644 index 0000000..e1dc457 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/plugin.xml @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java new file mode 100644 index 0000000..31b54da --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java @@ -0,0 +1,766 @@ +/* + * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * $Id: DOMBuilder.java,v 1.19 2004/02/25 13:07:51 aruny Exp $ + */ +package org.apache.nutch.parse.html; + +import java.util.Stack; + +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.Text; +import org.w3c.dom.CDATASection; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.ext.LexicalHandler; + +/** + * This class takes SAX events (in addition to some extra events that SAX + * doesn't handle yet) and adds the result to a document or document fragment. + */ +public class DOMBuilder implements ContentHandler, LexicalHandler { + + /** Root document */ + public Document m_doc; + + /** Current node */ + protected Node m_currentNode = null; + + /** First node of document fragment or null if not a DocumentFragment */ + public DocumentFragment m_docFrag = null; + + /** Vector of element nodes */ + protected Stack m_elemStack = new Stack(); + + /** + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param node + * Current node + */ + public DOMBuilder(Document doc, Node node) { + m_doc = doc; + m_currentNode = node; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param docFrag + * Document fragment + */ + public DOMBuilder(Document doc, DocumentFragment docFrag) { + m_doc = doc; + m_docFrag = docFrag; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document. + * + * @param doc + * Root document + */ + public DOMBuilder(Document doc) { + m_doc = doc; + } + + /** + * Get the root node of the DOM being created. This is either a Document or a + * DocumentFragment. 
+ * + * @return The root document or document fragment if not null + */ + public Node getRootNode() { + return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; + } + + /** + * Get the node currently being processed. + * + * @return the current node being processed + */ + public Node getCurrentNode() { + return m_currentNode; + } + + /** + * Return null since there is no Writer for this class. + * + * @return null + */ + public java.io.Writer getWriter() { + return null; + } + + /** + * Append a node to the current container. + * + * @param newNode + * New node to append + */ + protected void append(Node newNode) throws org.xml.sax.SAXException { + + Node currentNode = m_currentNode; + + if (null != currentNode) { + currentNode.appendChild(newNode); + + // System.out.println(newNode.getNodeName()); + } else if (null != m_docFrag) { + m_docFrag.appendChild(newNode); + } else { + boolean ok = true; + short type = newNode.getNodeType(); + + if (type == Node.TEXT_NODE) { + String data = newNode.getNodeValue(); + + if ((null != data) && (data.trim().length() > 0)) { + throw new org.xml.sax.SAXException( + "Warning: can't output text before document element! Ignoring..."); + } + + ok = false; + } else if (type == Node.ELEMENT_NODE) { + if (m_doc.getDocumentElement() != null) { + throw new org.xml.sax.SAXException( + "Can't have more than one root on a DOM!"); + } + } + + if (ok) + m_doc.appendChild(newNode); + } + } + + /** + * Receive an object for locating the origin of SAX document events. + * + *

+ * SAX parsers are strongly encouraged (though not absolutely required) to + * supply a locator: if it does so, it must supply the locator to the + * application by invoking this method before invoking any of the other + * methods in the ContentHandler interface. + *

+ * + *

+ * The locator allows the application to determine the end position of any + * document-related event, even if the parser is not reporting an error. + * Typically, the application will use this information for reporting its own + * errors (such as character content that does not match an application's + * business rules). The information returned by the locator is probably not + * sufficient for use with a search engine. + *

+ * + *

+ * Note that the locator will return correct information only during the + * invocation of the events in this interface. The application should not + * attempt to use it at any other time. + *

+ * + * @param locator + * An object that can return the location of any SAX document event. + * @see org.xml.sax.Locator + */ + public void setDocumentLocator(Locator locator) { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of a document. + * + *

+ * The SAX parser will invoke this method only once, before any other methods + * in this interface or in DTDHandler (except for setDocumentLocator). + *

+ */ + public void startDocument() throws org.xml.sax.SAXException { + + // No action for the moment. + } + + /** + * Receive notification of the end of a document. + * + *

+ * The SAX parser will invoke this method only once, and it will be the last + * method invoked during the parse. The parser shall not invoke this method + * until it has either abandoned parsing (because of an unrecoverable error) + * or reached the end of input. + *

+   */
+  public void endDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment: nothing is finalized when the document ends.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   *
+   *

+ * The Parser will invoke this method at the beginning of every element in the + * XML document; there will be a corresponding endElement() event for every + * startElement() event (even when the element is empty). All of the element's + * content will be reported, in order, before the corresponding endElement() + * event. + *

+ * + *

+ * If the element name has a namespace prefix, the prefix will still be + * attached. Note that the attribute list provided will contain only + * attributes with explicit values (specified or defaulted): #IMPLIED + * attributes will be omitted. + *

+ * + * + * @param ns + * The namespace of the node + * @param localName + * The local part of the qualified name + * @param name + * The element name. + * @param atts + * The attributes attached to the element, if any. + * @see #endElement + * @see org.xml.sax.Attributes + */ + public void startElement(String ns, String localName, String name, + Attributes atts) throws org.xml.sax.SAXException { + + Element elem; + + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. + if ((null == ns) || (ns.length() == 0)) + elem = m_doc.createElementNS(null, name); + else + elem = m_doc.createElementNS(ns, name); + + append(elem); + + try { + int nAtts = atts.getLength(); + + if (0 != nAtts) { + for (int i = 0; i < nAtts; i++) { + + // System.out.println("type " + atts.getType(i) + " name " + + // atts.getLocalName(i) ); + // First handle a possible ID attribute + if (atts.getType(i).equalsIgnoreCase("ID")) + setIDAttribute(atts.getValue(i), elem); + + String attrNS = atts.getURI(i); + + if ("".equals(attrNS)) + attrNS = null; // DOM represents no-namespace as null + + // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // Crimson won't let us set an xmlns: attribute on the DOM. + String attrQName = atts.getQName(i); + + // In SAX, xmlns: attributes have an empty namespace, while in DOM + // they should have the xmlns namespace + if (attrQName.startsWith("xmlns:")) + attrNS = "http://www.w3.org/2000/xmlns/"; + + // ALWAYS use the DOM Level 2 call! + elem.setAttributeNS(attrNS, attrQName, atts.getValue(i)); + } + } + + // append(elem); + + m_elemStack.push(elem); + + m_currentNode = elem; + + // append(elem); + } catch (java.lang.Exception de) { + // de.printStackTrace(); + throw new org.xml.sax.SAXException(de); + } + + } + + /** + * + * + * + * Receive notification of the end of an element. + * + *

+ * The SAX parser will invoke this method at the end of every element in the + * XML document; there will be a corresponding startElement() event for every + * endElement() event (even when the element is empty). + *

+ * + *

+ * If the element name has a namespace prefix, the prefix will still be + * attached to the name. + *

+ * + * + * @param ns + * the namespace of the element + * @param localName + * The local part of the qualified name of the element + * @param name + * The element name + */ + public void endElement(String ns, String localName, String name) + throws org.xml.sax.SAXException { + m_elemStack.pop(); + m_currentNode = m_elemStack.isEmpty() ? null : m_elemStack.peek(); + } + + /** + * Set an ID string to node association in the ID table. + * + * @param id + * The ID string. + * @param elem + * The associated ID. + */ + public void setIDAttribute(String id, Element elem) { + + // Do nothing. This method is meant to be overiden. + } + + /** + * Receive notification of character data. + * + *

+ * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + *

+ * + *

+ * The application must not attempt to read from the array outside of the + * specified range. + *

+ * + *

+ * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + *

+ * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void characters(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + if (m_inCData) { + cdata(ch, start, length); + + return; + } + + String s = new String(ch, start, length); + Node childNode; + childNode = m_currentNode != null ? m_currentNode.getLastChild() : null; + if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) { + ((Text) childNode).appendData(s); + } else { + Text text = m_doc.createTextNode(s); + append(text); + } + } + + /** + * If available, when the disable-output-escaping attribute is used, output + * raw text without escaping. A PI will be inserted in front of the node with + * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom". + * + * @param ch + * Array containing the characters + * @param start + * Index to start of characters in the array + * @param length + * Number of characters in the array + */ + public void charactersRaw(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createProcessingInstruction("xslt-next-is-raw", + "formatter-to-dom")); + append(m_doc.createTextNode(s)); + } + + /** + * Report the beginning of an entity. + * + * The start and end of the document entity are not reported. The start and + * end of the external DTD subset are reported using the pseudo-name "[dtd]". + * All other events must be properly nested within start/end entity events. 
+   *
+   * @param name
+   *          The name of the entity. If it is a parameter entity, the name will
+   *          begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException {
+
+    // Deliberately a no-op. Emitting an entity reference here was tried and
+    // judged almost certainly the wrong behavior:
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity. This implementation does nothing.
+   *
+   * @param name
+   *          The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of an entity reference. An EntityReference node with
+   * the given name is created and appended to the current container.
+   *
+   * @param name
+   *          name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException {
+    append(m_doc.createEntityReference(name));
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   *
+   *

+ * Validating Parsers must use this method to report each chunk of ignorable + * whitespace (see the W3C XML 1.0 recommendation, section 2.10): + * non-validating parsers may also use this method if they are capable of + * parsing and using content models. + *

+ * + *

+ * SAX parsers may return all contiguous whitespace in a single chunk, or they + * may split it into several chunks; however, all of the characters in any + * single event must come from the same external entity, so that the Locator + * provides useful information. + *

+ * + *

+ * The application must not attempt to read from the array outside of the + * specified range. + *

+ * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. + * @see #characters + */ + public void ignorableWhitespace(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createTextNode(s)); + } + + /** + * Tell if the current node is outside the document element. + * + * @return true if the current node is outside the document element. + */ + private boolean isOutsideDocElem() { + return (null == m_docFrag) + && m_elemStack.size() == 0 + && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } + + /** + * Receive notification of a processing instruction. + * + *

+ * The Parser will invoke this method once for each processing instruction + * found: note that processing instructions may occur before or after the main + * document element. + *

+ * + *

+ * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) + * or a text declaration (XML 1.0, section 4.3.1) using this method. + *

+ * + * @param target + * The processing instruction target. + * @param data + * The processing instruction data, or null if none was supplied. + */ + public void processingInstruction(String target, String data) + throws org.xml.sax.SAXException { + append(m_doc.createProcessingInstruction(target, data)); + } + + /** + * Report an XML comment anywhere in the document. + * + * This callback will be used for comments inside or outside the document + * element, including comments in the external DTD subset (if read). + * + * @param ch + * An array holding the characters in the comment. + * @param start + * The starting position in the array. + * @param length + * The number of characters to use from the array. + */ + public void comment(char ch[], int start, int length) + throws org.xml.sax.SAXException { + // tagsoup sometimes submits invalid values here + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) + return; + append(m_doc.createComment(new String(ch, start, length))); + } + + /** Flag indicating that we are processing a CData section */ + protected boolean m_inCData = false; + + /** + * Report the start of a CDATA section. + * + * @see #endCDATA + */ + public void startCDATA() throws org.xml.sax.SAXException { + m_inCData = true; + append(m_doc.createCDATASection("")); + } + + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + public void endCDATA() throws org.xml.sax.SAXException { + m_inCData = false; + } + + /** + * Receive notification of cdata. + * + *

+ * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + *

+ * + *

+ * The application must not attempt to read from the array outside of the + * specified range. + *

+ * + *

+ * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + *

+ * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void cdata(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + // XXX ab@apache.org: modified from the original, to accomodate TagSoup. + Node n = m_currentNode.getLastChild(); + if (n instanceof CDATASection) + ((CDATASection) n).appendData(s); + else if (n instanceof Comment) + ((Comment) n).appendData(s); + } + + /** + * Report the start of DTD declarations, if any. + * + * Any declarations are assumed to be in the internal subset unless otherwise + * indicated. + * + * @param name + * The document type name. + * @param publicId + * The declared public identifier for the external DTD subset, or + * null if none was declared. + * @param systemId + * The declared system identifier for the external DTD subset, or + * null if none was declared. + * @see #endDTD + * @see #startEntity + */ + public void startDTD(String name, String publicId, String systemId) + throws org.xml.sax.SAXException { + + // Do nothing for now. + } + + /** + * Report the end of DTD declarations. + * + * @see #startDTD + */ + public void endDTD() throws org.xml.sax.SAXException { + + // Do nothing for now. + } + + /** + * Begin the scope of a prefix-URI Namespace mapping. + * + *

+ * The information from this event is not necessary for normal Namespace + * processing: the SAX XML reader will automatically replace prefixes for + * element and attribute names when the http://xml.org/sax/features/namespaces + * feature is true (the default). + *

+ * + *

+ * There are cases, however, when applications need to use prefixes in + * character data or in attribute values, where they cannot safely be expanded + * automatically; the start/endPrefixMapping event supplies the information to + * the application to expand prefixes in those contexts itself, if necessary. + *

+ * + *

+ * Note that start/endPrefixMapping events are not guaranteed to be properly + * nested relative to each-other: all startPrefixMapping events will occur + * before the corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, but their order + * is not guaranteed. + *

+   *
+   * @param prefix
+   *          The Namespace prefix being declared.
+   * @param uri
+   *          The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+      throws org.xml.sax.SAXException {
+
+    // This method is currently a no-op. The disabled code below would have
+    // written the declaration as an xmlns attribute on the current element.
+    /*
+     * // Not sure if this is needed or wanted // Also, it fails in the stree.
+     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
+     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
+     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
+     * = "xmlns:"+prefix;
+     *
+     * Element elem = (Element)m_currentNode; String val =
+     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
+     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
+     * uri); } }
+     */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   *
+   *

+ * See startPrefixMapping for details. This event will always occur after the + * corresponding endElement event, but the order of endPrefixMapping events is + * not otherwise guaranteed. + *

+   *
+   * @param prefix
+   *          The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+    // Intentionally empty: prefix mappings are not tracked here.
+  }
+
+  /**
+   * Receive notification of a skipped entity.
+   *
+   *

+ * The Parser will invoke this method once for each entity skipped. + * Non-validating processors may skip entities if they have not seen the + * declarations (because, for example, the entity was declared in an external + * DTD subset). All processors may skip external entities, depending on the + * values of the http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities properties. + *

+   *
+   * @param name
+   *          The name of the skipped entity. If it is a parameter entity, the
+   *          name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException {
+    // Intentionally empty: skipped entities are ignored.
+  }
+}
diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
new file mode 100644
index 0000000..3ba3716
--- /dev/null
+++ b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -0,0 +1,366 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.Collection;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ *
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap linkParams = new HashMap(); + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection forceTags = new ArrayList(1); + + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + if (!forceTags.contains(ignoreTags[i])) + linkParams.remove(ignoreTags[i]); + } + } + + /** + * This method takes a {@link StringBuilder} and a DOM {@link Node}, and will + * append all the content text found beneath the DOM node to the + * StringBuilder. + * + *

+ * + * If abortOnNestedAnchors is true, DOM traversal will be aborted + * and the StringBuffer will not contain any text encountered + * after a nested anchor is found. + * + *

+ * + * @return true if nested anchors were found + */ + public boolean getText(StringBuilder sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + /** + * This is a convinience method, equivalent to + * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuilder sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuilder sb, Node node, + boolean abortOnNestedAnchors, int anchorDepth) { + boolean abort = false; + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("script".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if ("style".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { + anchorDepth++; + if (anchorDepth > 1) { + abort = true; + break; + } + } + if (nodeType == Node.COMMENT_NODE) { + walker.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + // cleanup and trim the value + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) + sb.append(' '); + sb.append(text); + } + } + } + + return abort; + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append the content text found beneath the first title node to + * the StringBuffer. 
+ * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuilder sb, Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return false; + } + + if (nodeType == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(nodeName)) { + getText(sb, currentNode); + return true; + } + } + } + + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + public URL getBase(Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + // is this node a BASE tag? + if (nodeType == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return null; + } + + if ("base".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) { + } + } + } + } + } + } + + // no. + return null; + } + + private boolean hasOnlyWhiteSpace(Node node) { + String val = node.getNodeValue(); + for (int i = 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... 
+ private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) + return false; + else + return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0 = children.item(0); + Node c1 = children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0 = children.item(0); + Node c1 = children.item(1); + Node c2 = children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2)) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * This method finds all anchors below the supplied DOM node, and + * creates appropriate {@link Outlink} records for each (relative to the + * supplied base URL), and adds them to the outlinks + * {@link ArrayList}. + * + *

+ * + * Links without inner structure (tags, text, etc) are discarded, as are links + * which contain only single nested links and empty text nodes (this is a + * common DOM-fixup artifact, at least with nekohtml). + */ + public void getOutlinks(URL base, ArrayList outlinks, Node node) { + + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + NodeList children = currentNode.getChildNodes(); + int childLen = (children != null) ? children.getLength() : 0; + + if (nodeType == Node.ELEMENT_NODE) { + + nodeName = nodeName.toLowerCase(); + LinkParams params = linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { + + StringBuilder linkText = new StringBuilder(); + getText(linkText, currentNode, true); + + NamedNodeMap attrs = currentNode.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if ("rel".equalsIgnoreCase(attrName) + && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) + && "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = URLUtil.resolveURL(base, target); + outlinks.add(new Outlink(url.toString(), linkText.toString() + .trim())); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) + continue; + } + } + } + } + +} diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java 
b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java new file mode 100644 index 0000000..159aa76 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java @@ -0,0 +1,214 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.html; + +import java.net.URL; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.w3c.dom.*; + +/** + * Class for parsing META Directives from DOM trees. This class handles + * specifically Robots META directives (all, none, nofollow, noindex), finding + * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are + * stored in a HTMLMetaTags instance. + */ +public class HTMLMetaProcessor { + + /** + * Utility class with indicators for the robots directives "noindex" and + * "nofollow", and HTTP-EQUIV/no-cache + */ + + /** + * Sets the indicators in robotsMeta to appropriate values, based + * on any META tags found under the given node. 
+ */ + public static final void getMetaTags(HTMLMetaTags metaTags, Node node, + URL currURL) { + + metaTags.reset(); + getMetaTagsHelper(metaTags, node, currURL); + } + + private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, + URL currURL) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) { + // META tags should not be under body + return; + } + + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node nameNode = null; + Node equivNode = null; + Node contentNode = null; + // Retrieves name, http-equiv and content attribues + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName().toLowerCase(); + if (attrName.equals("name")) { + nameNode = attr; + } else if (attrName.equals("http-equiv")) { + equivNode = attr; + } else if (attrName.equals("content")) { + contentNode = attr; + } + } + + if (nameNode != null) { + if (contentNode != null) { + String name = nameNode.getNodeValue().toLowerCase(); + metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); + if ("robots".equals(name)) { + + if (contentNode != null) { + String directives = contentNode.getNodeValue().toLowerCase(); + int index = directives.indexOf("none"); + + if (index >= 0) { + metaTags.setNoIndex(); + metaTags.setNoFollow(); + } + + index = directives.indexOf("all"); + if (index >= 0) { + // do nothing... 
+ } + + index = directives.indexOf("noindex"); + if (index >= 0) { + metaTags.setNoIndex(); + } + + index = directives.indexOf("nofollow"); + if (index >= 0) { + metaTags.setNoFollow(); + } + + index = directives.indexOf("noarchive"); + if (index >= 0) { + metaTags.setNoCache(); + } + } + + } // end if (name == robots) + } + } + + if (equivNode != null) { + if (contentNode != null) { + String name = equivNode.getNodeValue().toLowerCase(); + String content = contentNode.getNodeValue(); + metaTags.getHttpEquivTags().setProperty(name, content); + if ("pragma".equals(name)) { + content = content.toLowerCase(); + int index = content.indexOf("no-cache"); + if (index >= 0) + metaTags.setNoCache(); + } else if ("refresh".equals(name)) { + int idx = content.indexOf(';'); + String time = null; + if (idx == -1) { // just the refresh time + time = content; + } else + time = content.substring(0, idx); + try { + metaTags.setRefreshTime(Integer.parseInt(time)); + // skip this if we couldn't parse the time + metaTags.setRefresh(true); + } catch (Exception e) { + ; + } + URL refreshUrl = null; + if (metaTags.getRefresh() && idx != -1) { // set the URL + idx = content.toLowerCase().indexOf("url="); + if (idx == -1) { // assume a mis-formatted entry with just the + // url + idx = content.indexOf(';') + 1; + } else + idx += 4; + if (idx != -1) { + String url = content.substring(idx); + try { + refreshUrl = new URL(url); + } catch (Exception e) { + // XXX according to the spec, this has to be an absolute + // XXX url. However, many websites use relative URLs and + // XXX expect browsers to handle that. + // XXX Unfortunately, in some cases this may create a + // XXX infinitely recursive paths (a crawler trap)... + // if (!url.startsWith("/")) url = "/" + url; + try { + refreshUrl = new URL(currURL, url); + } catch (Exception e1) { + refreshUrl = null; + } + } + } + } + if (metaTags.getRefresh()) { + if (refreshUrl == null) { + // apparently only refresh time was present. 
set the URL + // to the same URL. + refreshUrl = currURL; + } + metaTags.setRefreshHref(refreshUrl); + } + } + } + } + + } else if ("base".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + + if (hrefNode != null) { + String urlString = hrefNode.getNodeValue(); + + URL url = null; + try { + if (currURL == null) + url = new URL(urlString); + else + url = new URL(currURL, urlString); + } catch (Exception e) { + ; + } + + if (url != null) + metaTags.setBaseHref(url); + } + + } + + } + + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + getMetaTagsHelper(metaTags, children.item(i), currURL); + } + } + } + +} diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java new file mode 100644 index 0000000..5102113 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -0,0 +1,388 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.html; + +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.avro.util.Utf8; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.html.dom.HTMLDocumentImpl; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.ParseFilters; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseStatusCodes; +import org.apache.nutch.parse.ParseStatusUtils; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.storage.ParseStatus; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.util.Bytes; +import org.apache.nutch.util.EncodingDetector; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.TableUtil; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.w3c.dom.DOMException; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class HtmlParser implements Parser { + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.html"); + + // I used 1000 bytes at first, but found that some documents have + // meta tag well past the first 1000 bytes. + // (e.g. 
http://cn.promo.yahoo.com/customcare/music.html) + private static final int CHUNK_SIZE = 2000; + + // NUTCH-1006 Meta equiv with single quotes not accepted + private static Pattern metaPattern = Pattern.compile( + "]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", + Pattern.CASE_INSENSITIVE); + private static Pattern charsetPattern = Pattern.compile( + "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); + private static Pattern charsetPatternHTML5 = Pattern.compile( + "]*>", + Pattern.CASE_INSENSITIVE); + + private static Collection FIELDS = new HashSet(); + + static { + FIELDS.add(WebPage.Field.BASE_URL); + } + + private String parserImpl; + + /** + * Given a ByteBuffer representing an html file of an + * unknown encoding, read out 'charset' parameter in the meta tag + * from the first CHUNK_SIZE bytes. If there's no meta tag for + * Content-Type or no charset is specified, the content is checked for a + * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented + * character encodings (UTF-16 only). If no character set can be determined, + * null is returned.
+ * See also + * http://www.w3.org/International/questions/qa-html-encoding-declarations, + * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and + * http://www.w3.org/TR/REC-xml/#sec-guessing
+ * + * @param content + * ByteBuffer representation of an html file + */ + + private static String sniffCharacterEncoding(ByteBuffer content) { + int length = Math.min(content.remaining(), CHUNK_SIZE); + + // We don't care about non-ASCII parts so that it's sufficient + // to just inflate each byte to a 16-bit value by padding. + // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into + // {U+0041, U+0082, U+00B7}. + String str = ""; + try { + str = new String(content.array(), content.arrayOffset() + + content.position(), length, Charset.forName("ASCII").toString()); + } catch (UnsupportedEncodingException e) { + // code should never come here, but just in case... + return null; + } + + Matcher metaMatcher = metaPattern.matcher(str); + String encoding = null; + if (metaMatcher.find()) { + Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); + if (charsetMatcher.find()) + encoding = new String(charsetMatcher.group(1)); + } + if (encoding == null) { + // check for HTML5 meta charset + metaMatcher = charsetPatternHTML5.matcher(str); + if (metaMatcher.find()) { + encoding = new String(metaMatcher.group(1)); + } + } + if (encoding == null) { + // check for BOM + if (length >= 3 && content.get(0) == (byte) 0xEF + && content.get(1) == (byte) 0xBB && content.get(2) == (byte) 0xBF) { + encoding = "UTF-8"; + } else if (length >= 2) { + if (content.get(0) == (byte) 0xFF && content.get(1) == (byte) 0xFE) { + encoding = "UTF-16LE"; + } else if (content.get(0) == (byte) 0xFE + && content.get(1) == (byte) 0xFF) { + encoding = "UTF-16BE"; + } + } + } + + return encoding; + } + + private String defaultCharEncoding; + + private Configuration conf; + + private DOMContentUtils utils; + + private ParseFilters htmlParseFilters; + + private String cachingPolicy; + + public Parse getParse(String url, WebPage page) { + HTMLMetaTags metaTags = new HTMLMetaTags(); + + String baseUrl = TableUtil.toString(page.getBaseUrl()); + URL base; + try { + base = new 
URL(baseUrl); + } catch (MalformedURLException e) { + return ParseStatusUtils.getEmptyParse(e, getConf()); + } + + String text = ""; + String title = ""; + Outlink[] outlinks = new Outlink[0]; + + // parse the content + DocumentFragment root; + try { + ByteBuffer contentInOctets = page.getContent(); + InputSource input = new InputSource(new ByteArrayInputStream( + contentInOctets.array(), contentInOctets.arrayOffset() + + contentInOctets.position(), contentInOctets.remaining())); + + EncodingDetector detector = new EncodingDetector(conf); + detector.autoDetectClues(page, true); + detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed"); + String encoding = detector.guessEncoding(page, defaultCharEncoding); + + page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), + ByteBuffer.wrap(Bytes.toBytes(encoding))); + page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), + ByteBuffer.wrap(Bytes.toBytes(encoding))); + + input.setEncoding(encoding); + if (LOG.isTraceEnabled()) { + LOG.trace("Parsing..."); + } + root = parse(input); + } catch (IOException e) { + LOG.error("Failed with the following IOException: ", e); + return ParseStatusUtils.getEmptyParse(e, getConf()); + } catch (DOMException e) { + LOG.error("Failed with the following DOMException: ", e); + return ParseStatusUtils.getEmptyParse(e, getConf()); + } catch (SAXException e) { + LOG.error("Failed with the following SAXException: ", e); + return ParseStatusUtils.getEmptyParse(e, getConf()); + } catch (Exception e) { + LOG.error("Failed with the following Exception: ", e); + return ParseStatusUtils.getEmptyParse(e, getConf()); + } + + // get meta directives + HTMLMetaProcessor.getMetaTags(metaTags, root, base); + if (LOG.isTraceEnabled()) { + LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); + } + // check meta directives + if (!metaTags.getNoIndex()) { // okay to index + StringBuilder sb = new StringBuilder(); + if (LOG.isTraceEnabled()) { + 
LOG.trace("Getting text..."); + } + utils.getText(sb, root); // extract text + text = sb.toString(); + sb.setLength(0); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting title..."); + } + utils.getTitle(sb, root); // extract title + title = sb.toString().trim(); + } + + if (!metaTags.getNoFollow()) { // okay to follow links + ArrayList l = new ArrayList(); // extract outlinks + URL baseTag = utils.getBase(root); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting links..."); + } + utils.getOutlinks(baseTag != null ? baseTag : base, l, root); + outlinks = l.toArray(new Outlink[l.size()]); + if (LOG.isTraceEnabled()) { + LOG.trace("found " + outlinks.length + " outlinks in " + url); + } + } + + ParseStatus status = ParseStatus.newBuilder().build(); + status.setMajorCode((int) ParseStatusCodes.SUCCESS); + if (metaTags.getRefresh()) { + status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT); + status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString())); + status.getArgs().add( + new Utf8(Integer.toString(metaTags.getRefreshTime()))); + } + + Parse parse = new Parse(text, title, outlinks, status); + parse = htmlParseFilters.filter(url, page, parse, metaTags, root); + + if (metaTags.getNoCache()) { // not okay to cache + page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), + ByteBuffer.wrap(Bytes.toBytes(cachingPolicy))); + } + + return parse; + } + + private DocumentFragment parse(InputSource input) throws Exception { + if (parserImpl.equalsIgnoreCase("tagsoup")) + return parseTagSoup(input); + else + return parseNeko(input); + } + + private DocumentFragment parseTagSoup(InputSource input) throws Exception { + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + DocumentFragment frag = doc.createDocumentFragment(); + DOMBuilder builder = new DOMBuilder(doc, frag); + org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser(); + reader.setContentHandler(builder); + reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, 
true); + reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); + reader + .setProperty("http://xml.org/sax/properties/lexical-handler", builder); + reader.parse(input); + return frag; + } + + private DocumentFragment parseNeko(InputSource input) throws Exception { + DOMFragmentParser parser = new DOMFragmentParser(); + try { + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + parser.setFeature("http://cyberneko.org/html/features/augmentations", + true); + parser.setProperty( + "http://cyberneko.org/html/properties/default-encoding", + defaultCharEncoding); + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/ignore-specified-charset", + true); + parser + .setFeature( + "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", + false); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/document-fragment", + true); + parser.setFeature("http://cyberneko.org/html/features/report-errors", + LOG.isTraceEnabled()); + } catch (SAXException e) { + } + // convert Document to DocumentFragment + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment res = doc.createDocumentFragment(); + DocumentFragment frag = doc.createDocumentFragment(); + parser.parse(input, frag); + res.appendChild(frag); + + try { + while (true) { + frag = doc.createDocumentFragment(); + parser.parse(input, frag); + if (!frag.hasChildNodes()) + break; + if (LOG.isInfoEnabled()) { + LOG.info(" - new frag, " + frag.getChildNodes().getLength() + + " nodes."); + } + res.appendChild(frag); + } + } catch (Exception x) { + LOG.error("Failed with the following Exception: ", x); + } + ; + return res; + } + + public void setConf(Configuration conf) { + this.conf = conf; + this.htmlParseFilters = new ParseFilters(getConf()); + this.parserImpl = getConf().get("parser.html.impl", "neko"); + this.defaultCharEncoding = getConf().get( + 
"parser.character.encoding.default", "windows-1252"); + this.utils = new DOMContentUtils(conf); + this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", + Nutch.CACHING_FORBIDDEN_CONTENT); + } + + public Configuration getConf() { + return this.conf; + } + + @Override + public Collection getFields() { + return FIELDS; + } + + public static void main(String[] args) throws Exception { + // LOG.setLevel(Level.FINE); + String name = args[0]; + String url = "file:" + name; + File file = new File(name); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + Configuration conf = NutchConfiguration.create(); + HtmlParser parser = new HtmlParser(); + parser.setConf(conf); + WebPage page = WebPage.newBuilder().build(); + page.setBaseUrl(new Utf8(url)); + page.setContent(ByteBuffer.wrap(bytes)); + page.setContentType(new Utf8("text/html")); + Parse parse = parser.getParse(url, page); + System.out.println("title: " + parse.getTitle()); + System.out.println("text: " + parse.getText()); + System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks())); + + } + +} diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java new file mode 100644 index 0000000..cfef10c --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java @@ -0,0 +1,112 @@ +/* + * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer, + * XXX in order to avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: XMLCharacterRecognizer.java,v 1.7 2004/02/17 04:21:14 minchau Exp $
 */
package org.apache.nutch.parse.html;

/**
 * Static checks for XML 1.0 whitespace, i.e. the four characters space
 * (0x20), tab (0x09), carriage return (0x0D) and line feed (0x0A).
 */
public class XMLCharacterRecognizer {

  /**
   * Tests a single character against the XML 1.0 whitespace set.
   * 
   * @param ch
   *          Character to check as XML whitespace.
   * @return true if <var>ch</var> is space, tab, CR or LF; false otherwise.
   */
  public static boolean isWhiteSpace(char ch) {
    switch (ch) {
    case 0x20:
    case 0x09:
    case 0xD:
    case 0xA:
      return true;
    default:
      return false;
    }
  }

  /**
   * Tests a slice of a character array.
   * 
   * @param ch
   *          Character array to check as XML whitespace.
   * @param start
   *          Start index of characters in the array
   * @param length
   *          Number of characters to check
   * @return True if every character in the slice is XML whitespace (also for
   *         an empty slice); otherwise, false.
   */
  public static boolean isWhiteSpace(char ch[], int start, int length) {

    final int limit = start + length;
    int pos = start;

    while (pos < limit) {
      if (!isWhiteSpace(ch[pos])) {
        return false;
      }
      pos++;
    }

    return true;
  }

  /**
   * Tests the whole content of a buffer.
   * 
   * @param buf
   *          StringBuffer to check as XML whitespace.
   * @return True if every character in the buffer is XML whitespace (also
   *         for an empty buffer), false otherwise
   */
  public static boolean isWhiteSpace(StringBuffer buf) {

    final int count = buf.length();
    int pos = 0;

    while (pos < count) {
      if (!isWhiteSpace(buf.charAt(pos))) {
        return false;
      }
      pos++;
    }

    return true;
  }

  /**
   * Tests a whole string.
   * 
   * @param s
   *          String to check as XML whitespace; may be null.
   * @return True if <var>s</var> is null, empty, or consists only of XML
   *         whitespace; false otherwise
   */
  public static boolean isWhiteSpace(String s) {

    if (s == null) {
      return true;
    }

    final int count = s.length();
    for (int pos = 0; pos < count; pos++) {
      if (!isWhiteSpace(s.charAt(pos))) {
        return false;
      }
    }

    return true;
  }

}
diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html
new file mode 100644
index 0000000..c650389
--- /dev/null
+++ b/apache-nutch-2.3/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html
@@ -0,0 +1,5 @@
+
+
+

An HTML document parsing plugin.

This package relies on NekoHTML.

+ + diff --git a/apache-nutch-2.3/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/apache-nutch-2.3/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java new file mode 100644 index 0000000..5440ec7 --- /dev/null +++ b/apache-nutch-2.3/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java @@ -0,0 +1,339 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.html; + +import org.apache.nutch.parse.Outlink; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.ByteArrayInputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import org.cyberneko.html.parsers.*; +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; + +import org.junit.Before; +import org.junit.Test; +import static org.junit.Assert.*; + +/** + * Unit tests for DOMContentUtils. 
+ */ +public class TestDOMContentUtils { + + private static final String[] testPages = { + new String(" title " + + " body " + + " anchor " + ""), + new String(" title " + + " body " + " home " + + "" + " " + " bots " + + ""), + new String(" " + " " + + " separate this " + " from this" + + "" + ""), + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ...
  • home
  • + //
  • 1
  • + //
  • 2
  • + new String(" my title " + + " body " + "" + + ""), + // test frameset link extraction. The invalid frame in the middle will be + // fixed to a third standalone frame. + new String(" my title " + + " " + "" + + "" + "" + + "" + "" + + "" + "" + "" + + "" + "" + ""), + // test and