Skip to content

Commit

Permalink
Imps: 1. add docker-compose.yaml for dependencies 2. improve Web UI
Browse files Browse the repository at this point in the history
Examples: 1. add EBayHarvesterStarter
  • Loading branch information
galaxyeye committed Oct 21, 2024
1 parent e63089c commit 418669f
Show file tree
Hide file tree
Showing 6 changed files with 532 additions and 86 deletions.
10 changes: 10 additions & 0 deletions docker/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Services this project depends on at runtime.
version: "3"

services:
  # MongoDB instance, published on the default MongoDB port of the host.
  mongodb:
    container_name: "mongodb"
    image: "mongo:latest"
    environment:
      # Name of the database used by the image's initialization step.
      MONGO_INITDB_DATABASE: "scent"
    ports:
      # Keep the mapping quoted so YAML never reads it as a number.
      - "27017:27017"
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package ai.platon.exotic.examples.agents

import ai.platon.exotic.crawl.common.ExoticMLPaths
import ai.platon.exotic.crawl.common.VerboseCrawler1
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.urls.Hyperlink
import ai.platon.pulsar.crawl.common.url.ListenableHyperlink
import ai.platon.pulsar.crawl.filter.AbstractScopedUrlNormalizer
import java.nio.file.Files
import java.time.Instant

/**
 * Helpers to recognize and normalize eBay product (item) page urls.
 */
object EBayUrls {
    /** Every url that starts with this prefix is treated as a product page. */
    val itemUrlPrefix = "https://www.ebay.com/itm/"

    /**
     * Checks whether [url] points to a product page, i.e. starts with [itemUrlPrefix].
     */
    fun isProductPage(url: String): Boolean = url.startsWith(itemUrlPrefix)

    /**
     * Reduces a product page url to its canonical form.
     *
     * Both the query string and the fragment are removed, so urls that differ only in
     * tracking parameters or in-page anchors collapse to the same result.
     *
     * @param url the url to normalize
     * @return the url without query string and fragment, or `null` if [url] is not a product page
     */
    fun normalizeProductUrl(url: String): String? {
        if (!isProductPage(url)) {
            return null
        }

        // Cut at whichever delimiter comes first: a '?' may appear inside a fragment and vice versa.
        return url.substringBefore('?').substringBefore('#')
    }
}

/**
 * A url normalizer for eBay product pages, backed by [EBayUrls].
 */
class EBayProductUrlNormalizer : AbstractScopedUrlNormalizer() {
    /** A url is relevant to this normalizer only if it is a product page url. */
    override fun isRelevant(url: String, scope: String) = EBayUrls.isProductPage(url)

    /** Delegates to [EBayUrls.normalizeProductUrl]; yields `null` for non-product urls. */
    override fun normalize(url: String, scope: String) = EBayUrls.normalizeProductUrl(url)

    /**
     * Normalizes [link] in place.
     *
     * On success the previous url is kept in `href` and `url` is replaced by the normalized url;
     * the very same [link] instance is returned. When the url can not be normalized,
     * [link] is left untouched and `null` is returned.
     *
     * Note: [scope] is accepted but not used by this overload.
     */
    fun normalize(link: Hyperlink, scope: String = ""): Hyperlink? {
        val normalized = normalize(link.url)
        if (normalized == null) {
            return null
        }

        // Remember where the link originally pointed to before overwriting it.
        link.href = link.url
        link.url = normalized
        return link
    }
}

/**
 * Loads eBay portal pages, follows their product links and writes the HTML of every
 * successfully fetched product page into the dataset directory of a project,
 * where the pages wait for training.
 *
 * @param args the load arguments used for the portal pages; the item arguments are derived from them
 * @param projectId the project id, used as the name of the dataset sub-directory
 */
class EBayHarvesterStarter(
    val args: String,
    val projectId: String
) {
    private val crawler = VerboseCrawler1()
    private val session = crawler.session

    private val datasetPath = ExoticMLPaths.datasetDir.resolve(projectId)
    private val htmlBaseDir = datasetPath.resolve("html")

    init {
        Files.createDirectories(htmlBaseDir)
        session.context.urlNormalizer.add(EBayProductUrlNormalizer())
    }

    /**
     * Loads a fixed eBay portal page and selects the hyperlinks matching `a[href~=/b/]`.
     */
    fun collectListPageLinks(): List<Hyperlink> {
        val portalDocument = session.loadDocument("https://www.ebay.com/b/Apple/bn_21819543")
        return portalDocument.selectHyperlinks("a[href~=/b/]")
    }

    /**
     * Loads all [portalUrls], submits the normalized product links found on them,
     * waits for the crawl to finish and finally writes the export info file.
     */
    fun loadAllAndExportToEncode(portalUrls: List<String>) {
        val options = session.options(args)
        val itemOptions = options.createItemOptions()
        val documents = session.loadDocuments(portalUrls, options)
        val productUrlNormalizer = EBayProductUrlNormalizer()

        val outLinks = documents.flatMap { document -> document.selectHyperlinks(options.outLinkSelector) }
        // Links that are not product pages are dropped; the set collapses equal links.
        val productLinks = outLinks.mapNotNullTo(HashSet()) { link -> productUrlNormalizer.normalize(link) }
        val listenableLinks = productLinks.map { link -> createListenableHyperlink(link, itemOptions.args) }

        session.submitAll(listenableLinks)
        session.context.await()

        createInfoFile()
    }

    /**
     * Copies [link] into a [ListenableHyperlink] which is loaded with `"$args -parse"` and which
     * writes the parsed document to [htmlBaseDir] if the fetch succeeded and the page is a product page.
     */
    private fun createListenableHyperlink(link: Hyperlink, args: String): ListenableHyperlink {
        val listenable = ListenableHyperlink(link.url, link.text, link.order, link.referrer, link.args, link.href)

        // The args copied from the link are replaced by the item args plus the parse switch.
        listenable.args = "$args -parse"
        listenable.event.loadEventHandlers.onHTMLDocumentParsed.addLast { page, document ->
            val pageUrl = page.url
            if (page.protocolStatus.isSuccess && EBayUrls.isProductPage(pageUrl)) {
                val target = htmlBaseDir.resolve(AppPaths.fromUri(pageUrl, suffix = ".html"))
                Files.writeString(target, document.outerHtml, Charsets.UTF_8)
            }
        }

        return listenable
    }

    /**
     * Writes `htmlExportInfo.txt` into the dataset directory, recording the build time and the load arguments.
     */
    private fun createInfoFile() {
        val infoFile = datasetPath.resolve("htmlExportInfo.txt")
        val content = """
            buildTime: ${Instant.now()}
            args: $args
        """.trimIndent()
        Files.writeString(infoFile, content)
    }
}

/**
 * Harvests the product pages linked from a fixed list of eBay portal pages
 * and exports them into the dataset of project `p1727773434`.
 */
fun main() {
    val projectId = "p1727773434"
    val args = " -i 10d -ii 100d -tl 1000 -ol a[href*=/itm/] -component #mainContent -itemRequireSize 800000 "

    val portalUrls = listOf(
        "https://www.ebay.com/b/Apple/bn_21819543",
        "https://www.ebay.com/b/Dell/bn_21823255",
        "https://www.ebay.com/b/HP-Laptops-and-Netbooks/177/bn_349568",
        "https://www.ebay.com/b/Lenovo/bn_21829183",
        "https://www.ebay.com/b/Microsoft/bn_21830663",
        "https://www.ebay.com/b/Canon-Digital-Cameras/31388/bn_740",
        "https://www.ebay.com/b/Nikon-Digital-Cameras/31388/bn_759",
        "https://www.ebay.com/b/LG/bn_21829255",
        "https://www.ebay.com/b/GoPro-Digital-Cameras/31388/bn_748",
        "https://www.ebay.com/b/Cameras-Photo/625/bn_1865546",
        "https://www.ebay.com/b/Video-Games-Consoles/1249/bn_1850232",
        "https://www.ebay.com/b/Portable-Audio-Headphones/15052/bn_1642614",
        "https://www.ebay.com/b/Cell-Phone-Displays/136699/bn_317614"
    )

    EBayHarvesterStarter(args, projectId).loadAllAndExportToEncode(portalUrls)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
package ai.platon.exotic.crawl.common

import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.RequiredDirectory
import org.slf4j.Logger
import org.slf4j.LoggerFactory
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.StandardCopyOption

/**
 * Paths for machine learning tasks.
 *
 * All paths live below [baseDir], except [promptTaskResultCacheDir].
 * On initialization, [createDirectories] is called to create the directories
 * whose backing fields carry the [RequiredDirectory] annotation.
 */
object ExoticMLPaths {

    val logger: Logger = LoggerFactory.getLogger(ExoticMLPaths::class.java)

    /**
     * The root directory of all machine learning files.
     * */
    @RequiredDirectory
    val baseDir: Path = AppPaths.getProcTmp("ml")

    /**
     * The directory to store datasets.
     * */
    @RequiredDirectory
    val datasetDir: Path = baseDir.resolve("dataset")

    /**
     * The parent directory of all task directories.
     * */
    @RequiredDirectory
    val taskBaseDir: Path = baseDir.resolve("tasks")

    /**
     * The pattern of project directory names.
     * A typical directory name: `p1723201624`.
     * - p: project
     * - 1723201624: seconds from epoch
     * */
    val projectBaseDirPattern = "p\\d+".toRegex()

    ////////////////////////////////////////////////////////////////////////////
    // Supervised task directories

    /**
     * The directory to store supervised tasks.
     * */
    @RequiredDirectory
    val supervisedTaskDir: Path = taskBaseDir.resolve("supervised")

    /**
     * The directory to store supervised tasks that are being processed.
     * */
    @RequiredDirectory
    val supervisedTaskProcessingDir = supervisedTaskDir.resolve("processing")
    /**
     * The directory to store supervised tasks that have been processed.
     * */
    @RequiredDirectory
    val supervisedTaskProcessedDir = supervisedTaskDir.resolve("processed")
    /**
     * The directory to store the results of supervised tasks.
     * */
    @RequiredDirectory
    val supervisedTaskResultBaseDir = supervisedTaskDir.resolve("result")

    ////////////////////////////////////////////////////////////////////////////
    // Unsupervised task directories

    /**
     * The directory to store unsupervised tasks.
     * A program may generate unsupervised tasks in this directory and another program may process the tasks.
     * */
    @RequiredDirectory
    val unsupervisedTaskDir: Path = taskBaseDir.resolve("unsupervised")

    /**
     * The directory to store unsupervised tasks that are being processed.
     * */
    @RequiredDirectory
    val unsupervisedTaskProcessingDir = unsupervisedTaskDir.resolve("processing")
    /**
     * The directory to store unsupervised tasks that have been processed.
     * */
    @RequiredDirectory
    val unsupervisedTaskProcessedDir = unsupervisedTaskDir.resolve("processed")
    /**
     * The directory to store the results of unsupervised tasks.
     * */
    @RequiredDirectory
    val unsupervisedTaskResultBaseDir = unsupervisedTaskDir.resolve("result")

    ////////////////////////////////////////////////////////////////////////////
    // Prompt task directories

    /**
     * The directory to store prompts.
     * A program may generate prompts in this directory and another program may process the prompts.
     * */
    @RequiredDirectory
    val promptTaskBaseDir = taskBaseDir.resolve("prompts")
    /**
     * The directory to store prompts that are being processed.
     * */
    @RequiredDirectory
    val promptTaskProcessingDir = promptTaskBaseDir.resolve("processing")
    /**
     * The directory to store prompts that have been processed.
     * */
    @RequiredDirectory
    val promptTaskProcessedDir = promptTaskBaseDir.resolve("processed")
    /**
     * The directory to store the results of prompt processing.
     * */
    @RequiredDirectory
    val promptTaskResultBaseDir = promptTaskBaseDir.resolve("result")
    /**
     * The directory to cache prompt results.
     * It is located below `AppPaths.CACHE_DIR`, not below [promptTaskResultBaseDir].
     * */
    @RequiredDirectory
    // val promptTaskResultCacheDir = promptTaskResultBaseDir.resolve("cache")
    val promptTaskResultCacheDir = AppPaths.CACHE_DIR.resolve("prompts")
    /**
     * The pattern of prompt file names.
     * A typical file name: `prompt.p1723201624.0.remarkable.txt`.
     * - p1723201624: the project id
     * - 0: the prediction in the clustering result
     * - remarkable: a label for the prompt, it's optional
     * - txt: the file extension
     *
     * Can be parsed into a PromptFile object.
     *
     * ```
     * data class PromptFile(
     *     val clusteringProjectId: String,
     *     val prediction: String,
     *     val label: String,
     *     val extension: String
     * )
     * ```
     *
     * Explanation:
     * - `prompt\.`: The literal prefix `prompt.` indicating a prompt file.
     * - `(p\d+)`: Group 1, the clustering project id: 'p' followed by digits.
     * - `(\d+)`: Group 2, the prediction (cluster id): one or more digits.
     * - `\.?(\w+)?`: Group 3, the optional label.
     * - `([a-zA-Z]+)`: Group 4, the file extension.
     * */
    val promptFileNamePattern = "prompt\\.(p\\d+)\\.(\\d+)\\.?(\\w+)?\\.([a-zA-Z]+)".toRegex()
    /**
     * The pattern of prompt answer file names.
     * The answer is generated by the LLM.
     * A typical file name: `prompt.p1723201624.0.excellent.answer.json`.
     * - p1723201624: the project id
     * - 0: the prediction in the clustering result
     * - excellent: a label for the answer, it's optional
     * - json: the file extension
     *
     * Can be parsed into a AnswerFile object.
     *
     * ```
     * data class AnswerFile(
     *     val clusteringProjectId: String,
     *     val prediction: String,
     *     val label: String,
     *     val extension: String
     * )
     * ```
     *
     * Explanation:
     * - `prompt\.`: The literal prefix `prompt.` indicating a prompt file.
     * - `(p\d+)`: Group 1, the clustering project id: 'p' followed by digits.
     * - `(\d+)`: Group 2, the prediction (cluster id): one or more digits.
     * - `\.?(\w+)?`: Group 3, the optional label.
     * - `\.answer`: The literal marker of an answer file.
     * - `([a-zA-Z]+)`: Group 4, the file extension.
     * */
    val answerFileNamePattern = "prompt\\.(p\\d+)\\.(\\d+)\\.?(\\w+)?\\.answer\\.([a-zA-Z]+)".toRegex()

    ////////////////////////////////////////////////////////////////////////////
    // Report directories

    /**
     * The directory to store reports.
     * A program may generate reports in this directory and another program may process the reports.
     * */
    @RequiredDirectory
    val reportBaseDir = taskBaseDir.resolve("report")
    /**
     * The directory to store reports that are being processed.
     * */
    @RequiredDirectory
    val reportProcessingDir = reportBaseDir.resolve("processing")
    /**
     * The directory to store reports that have been processed.
     * */
    @RequiredDirectory
    val reportProcessedDir = reportBaseDir.resolve("processed")
    /**
     * The directory to store the result of report processing.
     * */
    @RequiredDirectory
    val reportResultDir = reportBaseDir.resolve("result")
    /**
     * The pattern of report file names.
     *
     * The file name should have an extension.
     * */
    val reportFileNamePattern = ".+\\..+".toRegex()

    // Must stay below all property declarations: the paths have to be assigned before they are read.
    init {
        createDirectories()
    }

    /**
     * Create directories if they do not exist.
     *
     * Every field of this object that carries the [RequiredDirectory] annotation and holds
     * a [Path] is considered; paths that already exist are left alone.
     */
    fun createDirectories() {
        ExoticMLPaths::class.java.declaredFields
            .filter { field -> field.annotations.any { it is RequiredDirectory } }
            // The fields are declared by this object, so read them from it.
            .mapNotNull { field -> field.get(this) as? Path }
            .forEach { dir ->
                if (!Files.exists(dir)) {
                    Files.createDirectories(dir)
                }
            }
    }

    /**
     * Copies [taskFile] into [unsupervisedTaskDir], keeping its file name and replacing an existing file.
     */
    fun copyToLearnUnsupervised(taskFile: Path) {
        val path = unsupervisedTaskDir.resolve(taskFile.fileName)
        Files.copy(taskFile, path, StandardCopyOption.REPLACE_EXISTING)
    }

    /**
     * Copies [promptFile] into [promptTaskBaseDir], keeping its file name and replacing an existing file.
     */
    fun copyToPrompt(promptFile: Path) {
        val path = promptTaskBaseDir.resolve(promptFile.fileName)
        Files.copy(promptFile, path, StandardCopyOption.REPLACE_EXISTING)
    }

    /**
     * Copies [reportFile] into [reportBaseDir], keeping its file name and replacing an existing file.
     */
    fun copyToReport(reportFile: Path) {
        val path = reportBaseDir.resolve(reportFile.fileName)
        Files.copy(reportFile, path, StandardCopyOption.REPLACE_EXISTING)
    }
}
6 changes: 3 additions & 3 deletions exotic-services/src/main/resources/templates/crawl/home.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

<div class="card my-3 rounded-0">
<div class="card-body">
<h2 class="text-center text-info">Never write another web scraper!</h2>
<h5 class="text-center text-info">Exotic automatically generates all the extract rules
and scrape web data completely and accurately at scale.</h5>
<h2 class="text-center text-info">An Army of AI Agents for Scalable Web Scraping</h2>
<h5 class="text-center text-info">This application shows how to schedule and manage web scraping tasks
with PulsarRPA</h5>
</div>
</div>

Expand Down
2 changes: 1 addition & 1 deletion exotic-services/src/main/resources/templates/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<head th:fragment="head(title, links, scripts)">

<!--/* Title will be replaced by their respective titles in the calling template. */-->
<title th:replace="${title}">Exotic</title>
<title th:replace="${title}">PlatonAI - An Army of AI Agents for Scalable Web Scraping</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
Expand Down
Loading

0 comments on commit 418669f

Please sign in to comment.