Skip to content

Commit

Permalink
Fix: 1. correct gora-shaded-mongodb.version 2. correct StandaloneAppl…
Browse files Browse the repository at this point in the history
…ication.kt
  • Loading branch information
galaxyeye committed Oct 24, 2024
1 parent c228cf6 commit 0f93476
Show file tree
Hide file tree
Showing 13 changed files with 125 additions and 51 deletions.
8 changes: 7 additions & 1 deletion README-CN.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ Exotic (代表奇异之星 - Exotic Star) 是 PulsarRPA 的专业版,包含升
*#不用再写爬虫了。Exotic 从网站学习,自动生成所有提取规则,将 Web 当作数据库进行查询,完整精确地交付规模化的 Web 数据:#*

. 步骤1:使用高级人工智能自动提取网页中的每个字段,并生成提取 SQL
. 步骤2:测试 SQL,并在必要时改进它们以匹配前端业务需求
. 步骤3:在 Web 控制台中创建调度规则,以连续运行 SQL 并下载所有 Web 数据,从而推动您的业务向前发展

最受欢迎的网站已经有几十个 link:exotic-app/exotic-examples/src/main/kotlin/ai/platon/exotic/examples/sites/[采集案例],我们正在不断增加更多的案例。
Expand Down Expand Up @@ -42,6 +41,10 @@ Exotic (代表奇异之星 - Exotic Star) 是 PulsarRPA 的专业版,包含升
[source,bash]
----
wget http://static.platonic.fun/repo/ai/platon/exotic/exotic-standalone.jar
# start mongodb
docker-compose -f docker/docker-compose.yaml up
java -jar exotic-standalone.jar
java -jar exotic-standalone.jar harvest "https://www.amazon.com/b?node=1292115011" -diagnose -refresh
----

== 从源代码构建
Expand All @@ -66,6 +69,9 @@ git clone https://github.com/platonai/exotic.git
cd exotic
mvn clean && mvn
cd exotic-standalone/target/
# Don't forget to start MongoDB
docker-compose -f docker/docker-compose.yaml up
----
对于国内开发者,我们强烈建议您按照 link:https://github.com/platonai/pulsarr/blob/master/bin/tools/maven/maven-settings.adoc[这个] 指导来加速构建。

Expand Down
10 changes: 9 additions & 1 deletion README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,17 @@ There are already dozens of link:exotic-app/exotic-examples/src/main/kotlin/ai/p
* The latest version of the Java 11 JDK
* Java and jar on the PATH
* Google Chrome 90+
* A running MongoDB instance

== Download
== Download & Run
Download the latest executable jar:
[source,bash]
----
wget http://static.platonic.fun/repo/ai/platon/exotic/exotic-standalone.jar
# start mongodb
docker-compose -f docker/docker-compose.yaml up
java -jar exotic-standalone.jar
java -jar exotic-standalone.jar harvest "https://www.amazon.com/b?node=1292115011" -diagnose -refresh
----

== Build from source
Expand All @@ -64,6 +69,9 @@ git clone https://github.com/platonai/exotic.git
cd exotic
mvn clean && mvn
cd exotic-standalone/target/
# Don't forget to start MongoDB
docker-compose -f docker/docker-compose.yaml up
----
For developers in China, we strongly recommend following link:https://github.com/platonai/pulsarr/blob/master/bin/tools/maven/maven-settings.adoc[these instructions] to speed up the build.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package ai.platon.exotic.crawl.common

import ai.platon.pulsar.common.warnUnexpected
import ai.platon.pulsar.dom.Documents
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.nodes.node.ext.isRegularText
import ai.platon.scent.ScentContext
Expand All @@ -9,14 +11,17 @@ import ai.platon.scent.dom.nodes.AnchorGroup
import ai.platon.scent.dom.nodes.node.ext.nthScreen
import ai.platon.scent.entities.HarvestResult
import ai.platon.scent.ml.EncodeOptions
import ai.platon.scent.ml.data.SimpleDataFrame
import ai.platon.scent.ml.encoding.EncodeProject
import ai.platon.scent.ql.h2.context.ScentSQLContext
import ai.platon.scent.ql.h2.context.ScentSQLContexts
import kotlinx.coroutines.runBlocking
import org.jsoup.nodes.Element
import org.slf4j.LoggerFactory
import java.nio.file.Files
import java.nio.file.Path
import java.util.*
import java.util.concurrent.atomic.AtomicBoolean
import kotlin.io.path.listDirectoryEntries
import kotlin.io.path.notExists

open class AdvancedVerboseCrawler(
context: ScentContext = ScentSQLContexts.create()
Expand Down Expand Up @@ -79,10 +84,12 @@ open class AdvancedVerboseCrawler(
documents: Iterable<FeaturedDocument>, encodeOptions: EncodeOptions
) = session.encodeDocuments(documents, encodeOptions) { it.isRegularText && it.nthScreen <= 2 }

fun harvest(url: String, args: String) = harvest(url, session.options(args))
fun harvest(portalUrl: String, args: String) = harvest(portalUrl, session.options(args))

fun harvest(url: String, options: HarvestOptions): HarvestResult {
val result = runBlocking { session.harvest(url, options) }
fun harvest(portalUrl: String, options: HarvestOptions): HarvestResult {
val result = runBlocking {
session.harvest(portalUrl, options)
}
report(result, options)
return result
}
Expand All @@ -97,6 +104,42 @@ open class AdvancedVerboseCrawler(
report(result, options)
return result
}

/**
 * Harvests the stored training documents of the project identified by [projectId].
 *
 * The documents are read from the `htmlBaseDir` of an [EncodeProject] of type
 * `TRAINING`, then harvested in batches of 200 using the fixed option string
 * `-projectId <projectId> -diagnose -vj -trustSamples`.
 *
 * @param projectId the project whose stored HTML samples are harvested
 * @param start the number of leading sample files to skip
 * @param limit the maximum number of sample files to load
 */
fun harvest(projectId: String, start: Int = 0, limit: Int = Int.MAX_VALUE) {
    val options = session.options("-projectId $projectId -diagnose -vj -trustSamples")
    val htmlBaseDir = EncodeProject(projectId, EncodeProject.Type.TRAINING).htmlBaseDir

    // The documents form a lazy sequence, so each batch of 200 is parsed only when it is consumed.
    for (batch in loadDocuments(htmlBaseDir, start, limit).chunked(200)) {
        harvest1(batch.asSequence(), options)
    }
}

/**
 * Harvests one batch of documents, reporting any failure through [warnUnexpected]
 * instead of propagating it to the caller.
 *
 * @param document the batch of documents to harvest
 * @param options the harvest options applied to the batch
 */
private fun harvest1(document: Sequence<FeaturedDocument>, options: HarvestOptions) {
    try {
        harvest(document, options)
    } catch (t: Throwable) {
        // Throwable (not just Exception) is caught on purpose, so nothing thrown by the batch escapes.
        warnUnexpected(this, t)
    }
}

/**
 * Lazily loads the `*.htm` files located directly under [htmlBaseDir] as documents.
 *
 * A warning is logged when fewer than 20 sample files are found. A directory that
 * does not exist is treated as containing no samples, so an empty sequence is returned.
 *
 * @param htmlBaseDir the directory holding the sample files
 * @param start the number of leading files to skip
 * @param limit the maximum number of files to load
 * @return a lazy sequence of parsed documents; files are parsed only when the sequence is consumed
 */
private fun loadDocuments(htmlBaseDir: Path, start: Int, limit: Int): Sequence<FeaturedDocument> {
    // List the directory once and reuse the result for both the sample count and the loading,
    // so the count always agrees with what is actually loaded and no directory stream is left open.
    // Note that File.endsWith(String) compares path components, so it cannot be used to test
    // for a file name suffix such as "htm".
    val htmlFiles = if (htmlBaseDir.notExists()) emptyList() else htmlBaseDir.listDirectoryEntries("*.htm")

    if (htmlFiles.size < 20) {
        logger.warn("Too few samples, might not generate a good result")
    }

    return htmlFiles.asSequence()
        .drop(start)
        .take(limit)
        // NOTE(review): the third argument is presumably the base URI of the parsed document - confirm.
        .map { Documents.parse(it, "UTF-8", it.toString()) }
        // Prefer the normalized URI as the base URI, falling back to the document's own base URI.
        .onEach { it.document.setBaseUri(it.normalizedURI ?: it.baseURI) }
}

override fun close() {
if (closed.compareAndSet(false, true)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@ import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.NetUtil
import ai.platon.pulsar.common.ProcessLauncher
import ai.platon.pulsar.common.browser.Browsers
import ai.platon.pulsar.common.stringify
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.context.PulsarContext
import ai.platon.pulsar.skeleton.context.PulsarContexts
import ai.platon.pulsar.common.warnUnexpected
import ai.platon.pulsar.skeleton.context.PulsarContext
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.context.PulsarContexts
import ai.platon.scent.context.ScentContexts
import ai.platon.scent.dom.HarvestOptions
import ai.platon.scent.entities.HarvestResult
import org.slf4j.LoggerFactory
import java.net.URL
import java.nio.file.Files
import java.util.concurrent.atomic.AtomicBoolean

open class VerboseCrawler(
Expand Down Expand Up @@ -89,19 +88,11 @@ open class VerboseCrawler(

fun report(result: HarvestResult, options: HarvestOptions) {
try {
session.buildAll(result.tableGroup, options)

val json = session.buildJson(result.tableGroup)
val path = AppPaths.REPORT_DIR.resolve("harvest/corpus/last-page-tables.json")
val baseDir = path.parent
Files.createDirectories(baseDir)
Files.writeString(path, json)

val exportedDocuments = session.buildAll(result.tableGroup, options)
val baseDir = exportedDocuments.keys.firstOrNull()?.parent ?: return
logger.info("Harvest result: file://$baseDir")

// openBrowser("$baseDir")
} catch (e: Exception) {
logger.warn(e.stringify("Failed to report harvest result - "))
warnUnexpected(this, e, "Failed to report harvest result")
}
}

Expand Down
12 changes: 6 additions & 6 deletions exotic-server/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@
<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>pulsar-ql</artifactId>
</dependency>

<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>gora-shaded-mongodb</artifactId>
<version>${gora-shaded-mongodb.version}</version>
</dependency>

<dependency>
<groupId>ai.platon.scent</groupId>
<artifactId>scent-boot</artifactId>
Expand All @@ -39,12 +45,6 @@
<artifactId>distributed-lock-mongo</artifactId>
<version>1.4.3</version>
</dependency>
<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>gora-shaded-mongodb</artifactId>
<version>0.8</version>
</dependency>

<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package ai.platon.exotic
package ai.platon.exotic.server

import ai.platon.exotic.handlers.AmazonHtmlIntegrityChecker
import ai.platon.exotic.handlers.JdHtmlIntegrityChecker
import ai.platon.exotic.server.handlers.AmazonHtmlIntegrityChecker
import ai.platon.exotic.server.handlers.JdHtmlIntegrityChecker
import ai.platon.pulsar.common.AppFiles
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.config.ImmutableConfig
Expand All @@ -10,6 +10,7 @@ import ai.platon.pulsar.protocol.browser.emulator.BrowserResponseHandler
import ai.platon.pulsar.skeleton.crawl.fetch.privacy.PrivacyContextMonitor
import ai.platon.scent.boot.autoconfigure.ScentContextInitializer
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import jakarta.annotation.PostConstruct
import org.h2.tools.Server
import org.springframework.boot.CommandLineRunner
import org.springframework.boot.autoconfigure.SpringBootApplication
Expand Down Expand Up @@ -52,12 +53,14 @@ class ExoticServerApplication(
* */
private val immutableConfig: ImmutableConfig
) {
@Bean
@PostConstruct
fun initBrowserResponseHandler() {
browserResponseHandler.emit(BrowserResponseEvents.initHTMLIntegrityChecker,
AmazonHtmlIntegrityChecker(immutableConfig))
AmazonHtmlIntegrityChecker(immutableConfig)
)
browserResponseHandler.emit(BrowserResponseEvents.initHTMLIntegrityChecker,
JdHtmlIntegrityChecker(immutableConfig))
JdHtmlIntegrityChecker(immutableConfig)
)
}

@Bean
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package ai.platon.exotic.handlers
package ai.platon.exotic.server.handlers

import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.HtmlUtils
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package ai.platon.exotic.handlers
package ai.platon.exotic.server.handlers

import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.config.ImmutableConfig
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package ai.platon.exotic.handlers
package ai.platon.exotic.server.handlers

import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.HtmlUtils
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,10 @@ import org.springframework.scheduling.annotation.EnableScheduling
scanBasePackages = [
"ai.platon.scent.boot.autoconfigure",
"ai.platon.scent.rest.api",
"ai.platon.exotic.services.api"
"ai.platon.exotic.services.api",
"ai.platon.exotic.standalone.api",
]
)
@ComponentScan(
"ai.platon.scent.rest.api",
"ai.platon.exotic.services.api",
"ai.platon.exotic.standalone.api",
)
@EntityScan(
"ai.platon.exotic.driver.crawl.entity",
"ai.platon.exotic.services.entity",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@ import ai.platon.pulsar.common.sql.ResultSetFormatter
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.ql.common.ResultSets
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.crawl.common.URLUtil
import ai.platon.scent.boot.autoconfigure.ScentContextInitializer
import ai.platon.scent.dom.HarvestOptions
import ai.platon.scent.ml.encoding.EncodeProject
import ai.platon.scent.ql.h2.context.ScentSQLContexts
import com.google.gson.GsonBuilder
import kotlinx.coroutines.runBlocking
import org.springframework.boot.builder.SpringApplicationBuilder
import java.nio.file.Files
import java.nio.file.Paths
import java.sql.ResultSet
import kotlin.system.exitProcess

Expand Down Expand Up @@ -218,6 +222,24 @@ class ExoticExecutor(val argv: Array<String>) {
AdvancedVerboseCrawler().harvest(portalUrl, args)
}
}

/**
 * Loads the out pages of the configured portal URL, then harvests the project
 * named by the `projectId` option parsed from the URL's arguments.
 *
 * Prints an error to standard error and returns without doing anything when the
 * portal URL is not a standard URL.
 */
internal fun harvest2() {
    val (seedUrl, loadArgs) = UrlUtils.splitUrlArgs(configuredUrl)
    if (UrlUtils.isStandard(seedUrl).not()) {
        System.err.println("The portal url is invalid")
        return
    }

    // Resolve the project id before the crawler is created, as the original ordering did.
    val projectId = session.options(loadArgs).projectId

    AdvancedVerboseCrawler().also { crawler ->
        crawler.loadOutPages(seedUrl, loadArgs)
        crawler.harvest(projectId)
    }
    // TODO: build views

    // TODO: return the views path
}

internal fun executeSQL() {
val context = ScentSQLContexts.create()
Expand Down
20 changes: 12 additions & 8 deletions harvest.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ if ($JAVA_VERSION -eq $null) {
Write-Output "WARNING: Java 11 is required to run this program"
}

if ($args.Length -eq 0) {
$args1 = $args
if ($args1.Length -eq 0) {
Write-Output "Usage: .\harvest.ps1 <URL>"
Write-Output "For example: .\harvest.ps1 https://www.amazon.com/b?node=1292115011"
exit 0
# create an array
$args1 = @("https://www.amazon.com/b?node=1292115011")
}

$FILES=(Get-ChildItem -Path "$AppHome/exotic-standalone/target/" -Filter "exotic-standalone*.jar" -Recurse)
Expand All @@ -25,21 +27,23 @@ if ($FILE_COUNT -eq 0) {
mvn -DskipTests=true
}

$JAR=(Resolve-Path $FILES[0])
# Get the first file in $FILES
$JAR = $FILES | Select-Object -First 1
$JAR = $JAR.FullName

$URL = $args[0]
$args = $args[1..($args.Length - 1)]
$ARGS = "-diagnose -vj $($args -join ' ')"
# The first argument is the portal URL; every remaining argument is forwarded to the harvest command.
$URL = $args1[0]
# Skip the URL with Select-Object so that a single-element argument list yields an empty array.
# A range slice such as [1..(Length - 1)] would evaluate the descending range 1..0 in that case
# and return the URL again, appending it to the harvest arguments a second time.
$args1 = @($args1 | Select-Object -Skip 1)
$HARVEST_ARGS = "-diagnose -vj $($args1 -join ' ')"

# --add-opens java.base/java.time=ALL-UNNAMED to fix JEP 396: Strongly Encapsulate JDK Internals by Default,
# the problem appears when upgrading java from 11 to 17.
$JVM_OPTS = "--add-opens=java.base/java.time=ALL-UNNAMED"
# $JVM_OPTS = "--add-exports=java.naming/com.sun.jndi.ldap=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.security=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.management/javax.management=ALL-UNNAMED --add-opens=java.naming/javax.naming=ALL-UNNAMED"

# Write-Output "java $JVM_OPTS -jar $JAR harvest $URL $ARGS"
Write-Output "java $JVM_OPTS -jar $JAR harvest $URL $HARVEST_ARGS"

try {
java $JVM_OPTS -jar "$JAR" harvest $URL $ARGS
java $JVM_OPTS -jar "$JAR" harvest "$URL" $HARVEST_ARGS
} catch {
Write-Error "Failed to execute the Java application: $_"
}
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -290,5 +290,6 @@

<cron-utils.version>9.1.6</cron-utils.version>
<embed.mongo.version>4.17.0</embed.mongo.version>
<gora-shaded-mongodb.version>0.9</gora-shaded-mongodb.version>
</properties>
</project>

0 comments on commit 0f93476

Please sign in to comment.