Skip to content

Commit

Permalink
2.0.x
Browse files Browse the repository at this point in the history
Upgrade pulsar to 2.0.x, scent to 2.0.x
  • Loading branch information
galaxyeye committed Oct 24, 2024
1 parent 7edadf8 commit c228cf6
Show file tree
Hide file tree
Showing 64 changed files with 241 additions and 646 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.12.2-SNAPSHOT
2.0.0-SNAPSHOT
2 changes: 1 addition & 1 deletion exotic-app/exotic-ML-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<parent>
<groupId>ai.platon.exotic</groupId>
<artifactId>exotic-app</artifactId>
<version>1.12.2-SNAPSHOT</version>
<version>2.0.0-SNAPSHOT</version>
</parent>

<artifactId>exotic-ML-examples</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ package ai.platon.exotic.examples.agents

import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.urls.Hyperlink
import ai.platon.pulsar.crawl.common.url.ListenableHyperlink
import ai.platon.pulsar.crawl.filter.AbstractScopedUrlNormalizer
import ai.platon.pulsar.skeleton.crawl.common.url.ListenableHyperlink
import ai.platon.pulsar.skeleton.crawl.filter.AbstractScopedUrlNormalizer
import ai.platon.scent.tools.VerboseCrawler
import java.nio.file.Files

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package ai.platon.exotic.examples.ml.supervised
import ai.platon.exotic.crawl.common.AmazonAsinUrlNormalizer
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.getLogger
import ai.platon.pulsar.dom.nodes.node.ext.isRegularText
import ai.platon.pulsar.persist.gora.generated.GWebPage
import ai.platon.scent.common.clearMLLabels
import ai.platon.scent.common.mlLabels
Expand Down Expand Up @@ -74,7 +75,10 @@ class AmazonMLRunner(

// val encodeOptions = EncodeOptions(labels, datasetPath, nGram = 1, nodeType = 1, textStrategy = 1)
val encodeOptions = EncodeOptions(datasetPath)
crawler.encodeElements(rootElements.asIterable(), encodeOptions)
// crawler.encodeElements(rootElements.asIterable(), encodeOptions)
session.encodeForElements(rootElements.asIterable(), encodeOptions) {
it.isRegularText
}

println("Dataset is exported | $datasetPath")
println("All done.")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package ai.platon.exotic.examples.ml.unsupervised.topEc.chinese.gome

import ai.platon.exotic.crawl.common.VerboseCrawler1
import ai.platon.scent.ScentEnvironment

fun main() {
val portalUrl = "https://list.gome.com.cn/cat10000092.html"
val args = "-i 10d -ii 500d -ol a[href~=item] -ignoreFailure"
ScentEnvironment().checkEnvironment()
val harvester = VerboseCrawler1()
harvester.harvest(portalUrl, args)
}
5 changes: 2 additions & 3 deletions exotic-app/exotic-OCR-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<parent>
<groupId>ai.platon.exotic</groupId>
<artifactId>exotic-app</artifactId>
<version>1.12.2-SNAPSHOT</version>
<version>2.0.0-SNAPSHOT</version>
</parent>

<artifactId>exotic-OCR-examples</artifactId>
Expand All @@ -41,8 +41,7 @@
<dependencies>
<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>pulsar-all</artifactId>
<version>${pulsar.version}</version>
<artifactId>pulsar-beans</artifactId>
</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ import ai.platon.exotic.examples.sites.food.dianping.TaskDef
import ai.platon.pulsar.common.AppContext
import ai.platon.pulsar.common.CheckState
import ai.platon.pulsar.common.getLogger
import ai.platon.pulsar.crawl.fetch.driver.AbstractWebDriver
import ai.platon.pulsar.crawl.fetch.driver.NavigateEntry
import ai.platon.pulsar.crawl.fetch.driver.WebDriver
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.crawl.fetch.driver.AbstractWebDriver
import ai.platon.pulsar.skeleton.crawl.fetch.driver.NavigateEntry
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriver
import kotlinx.coroutines.delay
import java.time.Duration
import java.time.Instant
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package ai.platon.exotic.examples.sites
import ai.platon.exotic.examples.sites.food.dianping.DianpingCrawler
import ai.platon.pulsar.browser.common.BrowserSettings
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.options.LoadOptions
import ai.platon.pulsar.skeleton.common.options.LoadOptions

/**
* java -jar exotic-OCR-examples*.jar -pc 8 -tab 10 -supervised -site walmart
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ package ai.platon.exotic.examples.sites.food.dianping

import ai.platon.pulsar.browser.common.BrowserSettings
import ai.platon.pulsar.common.ResourceLoader
import ai.platon.pulsar.common.options.LoadOptions
import ai.platon.pulsar.common.urls.UrlAware
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.context.support.AbstractPulsarContext
import ai.platon.pulsar.crawl.common.url.ParsableHyperlink
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.select.selectHyperlinks
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.session.PulsarSession
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.context.support.AbstractPulsarContext
import ai.platon.pulsar.skeleton.crawl.common.url.ParsableHyperlink
import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.scent.context.ScentContexts
import java.time.Duration

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@ import ai.platon.exotic.examples.sites.CommonRPA
import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.brief
import ai.platon.pulsar.common.getLogger
import ai.platon.pulsar.common.message.MiscMessageWriter
import ai.platon.pulsar.common.options.LoadOptions
import ai.platon.pulsar.common.serialize.json.prettyPulsarObjectMapper
import ai.platon.pulsar.common.stringify
import ai.platon.pulsar.context.support.AbstractPulsarContext
import ai.platon.pulsar.crawl.CoreMetrics
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.PageDatum
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.protocol.browser.emulator.BrowserResponseHandler
import ai.platon.pulsar.protocol.browser.emulator.util.HtmlIntegrityChecker
import ai.platon.pulsar.session.PulsarSession
import ai.platon.pulsar.skeleton.common.message.MiscMessageWriter
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.context.support.AbstractPulsarContext
import ai.platon.pulsar.skeleton.crawl.CoreMetrics
import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.scent.context.ScentContexts
import com.fasterxml.jackson.core.JsonProcessingException
import com.google.gson.GsonBuilder
Expand Down Expand Up @@ -71,34 +71,34 @@ class RestaurantRPA(
val options = session.options(args)

registerEventHandlers(options)
registerItemEventHandlers(options)
registeritemEventHandlers(options)

return options
}

private fun registerEventHandlers(options: LoadOptions) {
options.event.loadEvent.onHTMLDocumentParsed.addLast { _, document: FeaturedDocument ->
options.event.loadEventHandlers.onHTMLDocumentParsed.addLast { _, document: FeaturedDocument ->
collectPortalUrls(document) }
}

private fun registerItemEventHandlers(options: LoadOptions) {
private fun registeritemEventHandlers(options: LoadOptions) {
val ie = options.itemEvent

ie.loadEvent.onWillLoad.addLast {
ie.loadEventHandlers.onWillLoad.addLast {
it
}

ie.loadEvent.onWillFetch.addLast { page ->
ie.loadEventHandlers.onWillFetch.addLast { page ->
page.fetchRetries = 0
// page.maxRetries = 6
page.pageModel?.clear()
}

ie.loadEvent.onLoaded.addLast { page ->
ie.loadEventHandlers.onLoaded.addLast { page ->
dumpPageModel(page)
}

val be = ie.browseEvent
val be = ie.browseEventHandlers
be.onWillCheckDocumentState.addLast { page, driver ->
}

Expand Down Expand Up @@ -152,7 +152,7 @@ class RestaurantRPA(
}
}

ie.loadEvent.onHTMLDocumentParsed.addLast { page, document ->
ie.loadEventHandlers.onHTMLDocumentParsed.addLast { page, document ->
val fields = page.variables.variables
.filterKeys { it.startsWith(Screenshot.OCR) }
.mapValues { it.value.toString() }
Expand Down Expand Up @@ -197,7 +197,7 @@ class RestaurantRPA(
}

val pageModel = page.pageModel ?: return
val fieldGroups = pageModel.fieldGroups.map { it.name to it.fields }
val fieldGroups = pageModel.unboxedFieldGroups
if (fieldGroups.isEmpty()) {
return
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package ai.platon.exotic.examples.sites.food.dianping

import ai.platon.pulsar.common.*
import ai.platon.pulsar.crawl.fetch.driver.WebDriver
import ai.platon.pulsar.crawl.fetch.driver.WebDriverCancellationException
import ai.platon.pulsar.crawl.fetch.driver.WebDriverException
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriver
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriverCancellationException
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriverException
import net.sourceforge.tess4j.Tesseract
import net.sourceforge.tess4j.TesseractException
import org.apache.commons.lang3.StringUtils
Expand Down
8 changes: 4 additions & 4 deletions exotic-app/exotic-PERF-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<parent>
<groupId>ai.platon.exotic</groupId>
<artifactId>exotic-app</artifactId>
<version>1.12.2-SNAPSHOT</version>
<version>2.0.0-SNAPSHOT</version>
</parent>

<artifactId>exotic-PERF-examples</artifactId>
Expand All @@ -24,14 +24,14 @@
<dependencies>
<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>pulsar-all</artifactId>
<version>${pulsar.version}</version>
<artifactId>pulsar-beans</artifactId>

</dependency>

<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>pulsar-rest</artifactId>
<version>${pulsar.version}</version>

</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ import ai.platon.pulsar.browser.common.BrowserSettings
import ai.platon.pulsar.browser.common.InteractSettings
import ai.platon.pulsar.common.LinkExtractors
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.metrics.MetricsSystem
import ai.platon.pulsar.common.proxy.ProxyPoolManager
import ai.platon.pulsar.context.PulsarContexts
import ai.platon.pulsar.crawl.common.url.ListenableHyperlink
import ai.platon.pulsar.skeleton.common.metrics.MetricsSystem
import ai.platon.pulsar.skeleton.context.PulsarContexts
import ai.platon.pulsar.skeleton.crawl.common.url.ListenableHyperlink
import org.springframework.boot.autoconfigure.SpringBootApplication
import org.springframework.boot.runApplication
import org.springframework.context.ApplicationContextInitializer
Expand Down Expand Up @@ -50,7 +50,7 @@ class HighPerformanceCrawler(
val links = LinkExtractors.fromResource(resource).asSequence()
.map { ListenableHyperlink(it, "", args = args) }
.onEach {
it.event.browseEvent.onWillNavigate.addLast { page, driver ->
it.event.browseEventHandlers.onWillNavigate.addLast { page, driver ->
// This is a temporary solution to override InteractSettings; it will be improved in the future
page.setVar("InteractSettings", interactSettings)
driver.addBlockedURLs(blockingUrls)
Expand Down
6 changes: 3 additions & 3 deletions exotic-app/exotic-RPA-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<parent>
<groupId>ai.platon.exotic</groupId>
<artifactId>exotic-app</artifactId>
<version>1.12.2-SNAPSHOT</version>
<version>2.0.0-SNAPSHOT</version>
</parent>

<artifactId>exotic-RPA-examples</artifactId>
Expand All @@ -41,8 +41,8 @@
<dependencies>
<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>pulsar-all</artifactId>
<version>${pulsar.version}</version>
<artifactId>pulsar-beans</artifactId>

</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import ai.platon.pulsar.persist.PageDatum
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.protocol.browser.emulator.BrowserResponseHandler
import ai.platon.pulsar.protocol.browser.emulator.util.HtmlIntegrityChecker
import ai.platon.pulsar.session.PulsarSession
import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.scent.context.ScentContexts
import org.jsoup.nodes.Document
import kotlin.streams.toList
Expand Down Expand Up @@ -51,13 +51,13 @@ class JdRPA(

fun options(args: String): LoadOptions {
val options = session.options(args)
initItemItemEventHandler(options)
initItemitemEventHandler(options)
return options
}

private fun initItemItemEventHandler(options: LoadOptions) {
private fun initItemitemEventHandler(options: LoadOptions) {
val eh = options.itemEvent
val be = eh.browseEvent
val be = eh.browseEventHandler
// Warm up the browser to avoid being blocked by the server.
be.onBrowserLaunched.addLast { page, driver ->
driver.addBlockedURLs(blockedUrls)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.protocol.browser.driver.cdt.ChromeDevtoolsDriver
import ai.platon.pulsar.protocol.browser.emulator.BrowserResponseHandler
import ai.platon.pulsar.protocol.browser.emulator.util.HtmlIntegrityChecker
import ai.platon.pulsar.session.PulsarSession
import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.scent.context.ScentContexts
import org.jsoup.nodes.Document
import java.time.Duration
Expand Down Expand Up @@ -63,7 +63,7 @@ class WalmartRPA(
collectPortalUrls(document)
}

val be = options.itemEvent.browseEvent
val be = options.itemEvent.browseEventHandlers
be.onBrowserLaunched.addLast { page, driver ->
// Warm up the browser to avoid being blocked by the website
if (driver is ChromeDevtoolsDriver) {
Expand Down
4 changes: 2 additions & 2 deletions exotic-app/exotic-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<parent>
<groupId>ai.platon.exotic</groupId>
<artifactId>exotic-app</artifactId>
<version>1.12.2-SNAPSHOT</version>
<version>2.0.0-SNAPSHOT</version>
</parent>

<artifactId>exotic-examples</artifactId>
Expand All @@ -25,7 +25,7 @@
<dependency>
<groupId>ai.platon.pulsar</groupId>
<artifactId>pulsar-protocol</artifactId>
<version>${pulsar.version}</version>

</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package ai.platon.exotic.examples

import ai.platon.pulsar.context.PulsarContexts
import ai.platon.pulsar.skeleton.context.PulsarContexts

fun main() {
val session = PulsarContexts.createSession()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package ai.platon.exotic.examples

import ai.platon.pulsar.browser.common.BrowserSettings
import ai.platon.pulsar.context.PulsarContexts
import ai.platon.pulsar.skeleton.context.PulsarContexts

fun main() {
BrowserSettings.disableProxy().privacy(1).withSPA().withPrototypeBrowser()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package ai.platon.exotic.examples.sites.ec.patpat

import ai.platon.pulsar.context.PulsarContexts
import ai.platon.pulsar.skeleton.context.PulsarContexts

fun main() {
val portalUrl = "https://us.patpat.com/category/Baby.html"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package ai.platon.exotic.examples.sites.food.dianping

import ai.platon.pulsar.context.PulsarContexts
import ai.platon.pulsar.skeleton.context.PulsarContexts
import com.google.gson.GsonBuilder

fun main() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package ai.platon.exotic.examples.sites.language

import ai.platon.pulsar.context.PulsarContexts
import ai.platon.pulsar.crawl.event.impl.CloseMaskLayerHandler
import ai.platon.pulsar.skeleton.context.PulsarContexts
import ai.platon.pulsar.skeleton.crawl.event.impl.CloseMaskLayerHandler

fun main() {
val portalUrl = "https://shopee.co.th/กระเป๋าเป้ผู้ชาย-cat.49.1037.10297?page=1"
Expand All @@ -14,7 +14,7 @@ fun main() {
val options = session.options(args)

val closeMaskLayerHandler = CloseMaskLayerHandler(closeMaskLayerSelector)
options.event.browseEvent.onDocumentActuallyReady.addLast(closeMaskLayerHandler)
options.event.browseEventHandlers.onDocumentActuallyReady.addLast(closeMaskLayerHandler)

session.loadOutPages(portalUrl, options)
}
Loading

0 comments on commit c228cf6

Please sign in to comment.