-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Imps: 1. add docker-compose.yaml for dependencies 2. improve Web UI
Examples: 1. add EBayHarvesterStarter
- Loading branch information
Showing
6 changed files
with
532 additions
and
86 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
version: '3' | ||
|
||
services: | ||
mongodb: | ||
container_name: 'mongodb' | ||
image: 'mongo:latest' | ||
environment: | ||
- 'MONGO_INITDB_DATABASE=scent' | ||
ports: | ||
- '27017:27017' |
125 changes: 125 additions & 0 deletions
125
.../exotic-ML-examples/src/main/kotlin/ai/platon/exotic/examples/agents/EBayTaskGenerator.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
package ai.platon.exotic.examples.agents | ||
|
||
import ai.platon.exotic.crawl.common.ExoticMLPaths | ||
import ai.platon.exotic.crawl.common.VerboseCrawler1 | ||
import ai.platon.pulsar.common.AppPaths | ||
import ai.platon.pulsar.common.urls.Hyperlink | ||
import ai.platon.pulsar.crawl.common.url.ListenableHyperlink | ||
import ai.platon.pulsar.crawl.filter.AbstractScopedUrlNormalizer | ||
import java.nio.file.Files | ||
import java.time.Instant | ||
|
||
/**
 * Helpers for recognizing and canonicalizing eBay item (product) page URLs.
 */
object EBayUrls {
    /** Every URL that starts with this prefix is treated as a product page. */
    val itemUrlPrefix = "https://www.ebay.com/itm/"

    /**
     * Tells whether [url] points to a product page, i.e. whether it starts with [itemUrlPrefix].
     */
    fun isProductPage(url: String): Boolean = url.startsWith(itemUrlPrefix)

    /**
     * Reduces a product page URL to its canonical form by dropping the query string and the fragment.
     *
     * Dropping both parts makes links that differ only in tracking parameters or in-page anchors
     * compare equal after normalization.
     *
     * @param url the URL to normalize
     * @return the URL without query string and fragment, or `null` if [url] is not a product page
     */
    fun normalizeProductUrl(url: String): String? {
        if (!isProductPage(url)) {
            return null
        }

        // Cut at the query first, then at the fragment: whichever delimiter comes first wins,
        // and a URL that contains neither is returned unchanged.
        return url.substringBefore('?').substringBefore('#')
    }
}
|
||
/**
 * A URL normalizer that handles eBay product pages only and delegates the actual work to [EBayUrls].
 */
class EBayProductUrlNormalizer : AbstractScopedUrlNormalizer() {
    /** Only product page URLs are relevant to this normalizer; [scope] is not consulted. */
    override fun isRelevant(url: String, scope: String): Boolean {
        return EBayUrls.isProductPage(url)
    }

    /** Returns the canonical product URL, or `null` if [url] is not a product page. */
    override fun normalize(url: String, scope: String): String? {
        return EBayUrls.normalizeProductUrl(url)
    }

    /**
     * Normalizes [link] in place.
     *
     * On success the link's previous url is stored in its `href` and its `url` is replaced by the
     * normalized one; the very same instance is returned.
     *
     * NOTE(review): [scope] is accepted but not forwarded to the string overload.
     *
     * @return the modified [link], or `null` if its url can not be normalized
     */
    fun normalize(link: Hyperlink, scope: String = ""): Hyperlink? {
        val canonicalUrl = normalize(link.url) ?: return null
        return link.also {
            // Remember the url as it was before normalization.
            it.href = it.url
            it.url = canonicalUrl
        }
    }
}
|
||
/**
 * Loads eBay portal pages, follows the product links found on them and copies every successfully
 * fetched product page to the project's dataset directory, where the pages wait for training.
 *
 * @param args the load arguments used for the portal pages; the item arguments are derived from them
 * @param projectId the name of the project directory below [ExoticMLPaths.datasetDir]
 */
class EBayHarvesterStarter(
    val args: String,
    val projectId: String
) {
    private val crawler = VerboseCrawler1()
    private val session = crawler.session

    private val datasetPath = ExoticMLPaths.datasetDir.resolve(projectId)
    private val htmlBaseDir = datasetPath.resolve("html")

    init {
        Files.createDirectories(htmlBaseDir)
        session.context.urlNormalizer.add(EBayProductUrlNormalizer())
    }

    /**
     * Loads a fixed eBay brand page and returns the links selected by `a[href~=/b/]`.
     */
    fun collectListPageLinks(): List<Hyperlink> {
        val document = session.loadDocument("https://www.ebay.com/b/Apple/bn_21819543")
        return document.selectHyperlinks("a[href~=/b/]")
    }

    /**
     * Loads all [portalUrls], submits the normalized product links found on them, waits for the
     * session context and finally writes the export info file.
     */
    fun loadAllAndExportToEncode(portalUrls: List<String>) {
        val options = session.options(args)
        val itemOptions = options.createItemOptions()
        val documents = session.loadDocuments(portalUrls, options)
        val urlNormalizer = EBayProductUrlNormalizer()

        val outLinks = documents.flatMap { it.selectHyperlinks(options.outLinkSelector) }
        // Links that are not product pages are dropped; the rest is collected into a set.
        val productLinks = outLinks.mapNotNullTo(HashSet()) { urlNormalizer.normalize(it) }
        val tasks = productLinks.map { createListenableHyperlink(it, itemOptions.args) }

        session.submitAll(tasks)
        session.context.await()

        createInfoFile()
    }

    /**
     * Wraps [link] into a [ListenableHyperlink] whose parsed document is written to [htmlBaseDir]
     * when the page was fetched successfully and is a product page.
     */
    private fun createListenableHyperlink(link: Hyperlink, args: String): ListenableHyperlink {
        val listenable = ListenableHyperlink(link.url, link.text, link.order, link.referrer, link.args, link.href)
        listenable.args = "$args -parse"

        listenable.event.loadEventHandlers.onHTMLDocumentParsed.addLast { page, document ->
            val url = page.url
            if (page.protocolStatus.isSuccess && EBayUrls.isProductPage(url)) {
                val fileName = AppPaths.fromUri(url, suffix = ".html")
                Files.writeString(htmlBaseDir.resolve(fileName), document.outerHtml, Charsets.UTF_8)
            }
        }

        return listenable
    }

    /**
     * Writes the build time and the load arguments to `htmlExportInfo.txt` in the dataset directory.
     */
    private fun createInfoFile() {
        val info = """
            buildTime: ${Instant.now()}
            args: $args
        """.trimIndent()

        Files.writeString(datasetPath.resolve("htmlExportInfo.txt"), info)
    }
}
|
||
/**
 * Harvests the product pages linked from a fixed set of eBay portal pages into project `p1727773434`.
 */
fun main() {
    val args = " -i 10d -ii 100d -tl 1000 -ol a[href*=/itm/] -component #mainContent -itemRequireSize 800000 "
    val projectId = "p1727773434"

    // The portal (listing) pages whose product links are harvested.
    val portalUrls = listOf(
        "https://www.ebay.com/b/Apple/bn_21819543",
        "https://www.ebay.com/b/Dell/bn_21823255",
        "https://www.ebay.com/b/HP-Laptops-and-Netbooks/177/bn_349568",
        "https://www.ebay.com/b/Lenovo/bn_21829183",
        "https://www.ebay.com/b/Microsoft/bn_21830663",
        "https://www.ebay.com/b/Canon-Digital-Cameras/31388/bn_740",
        "https://www.ebay.com/b/Nikon-Digital-Cameras/31388/bn_759",
        "https://www.ebay.com/b/LG/bn_21829255",
        "https://www.ebay.com/b/GoPro-Digital-Cameras/31388/bn_748",
        "https://www.ebay.com/b/Cameras-Photo/625/bn_1865546",
        "https://www.ebay.com/b/Video-Games-Consoles/1249/bn_1850232",
        "https://www.ebay.com/b/Portable-Audio-Headphones/15052/bn_1642614",
        "https://www.ebay.com/b/Cell-Phone-Displays/136699/bn_317614"
    )

    EBayHarvesterStarter(args, projectId).loadAllAndExportToEncode(portalUrls)
}
224 changes: 224 additions & 0 deletions
224
exotic-crawl-common/src/main/kotlin/ai/platon/exotic/crawl/common/ExoticMLPaths.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@ | ||
package ai.platon.exotic.crawl.common | ||
|
||
import ai.platon.pulsar.common.AppPaths | ||
import ai.platon.pulsar.common.RequiredDirectory | ||
import org.slf4j.Logger | ||
import org.slf4j.LoggerFactory | ||
import java.nio.file.Files | ||
import java.nio.file.Path | ||
import java.nio.file.StandardCopyOption | ||
|
||
/**
 * Paths for machine learning tasks.
 *
 * Every [Path] property annotated with [RequiredDirectory] is created, if it does not exist yet,
 * when this object is initialized, see [createDirectories].
 */
object ExoticMLPaths {

    val logger: Logger = LoggerFactory.getLogger(ExoticMLPaths::class.java)

    /**
     * The base directory of all machine learning files, obtained from `AppPaths.getProcTmp("ml")`.
     * */
    @RequiredDirectory
    val baseDir: Path = AppPaths.getProcTmp("ml")

    /**
     * The directory to store datasets.
     * */
    @RequiredDirectory
    val datasetDir: Path = baseDir.resolve("dataset")

    /**
     * The base directory of all task directories.
     * */
    @RequiredDirectory
    val taskBaseDir: Path = baseDir.resolve("tasks")

    /**
     * The pattern of project directory names.
     * A typical directory name: `p1723201624`.
     * - p: project
     * - 1723201624: seconds from epoch
     * */
    val projectBaseDirPattern = "p\\d+".toRegex()

    ////////////////////////////////////////////////////////////////////////////
    // Supervised task directories

    /**
     * The directory to store supervised tasks.
     * */
    @RequiredDirectory
    val supervisedTaskDir: Path = taskBaseDir.resolve("supervised")

    /**
     * The directory to store supervised tasks that are being processed.
     * */
    @RequiredDirectory
    val supervisedTaskProcessingDir = supervisedTaskDir.resolve("processing")
    /**
     * The directory to store supervised tasks that have been processed.
     * */
    @RequiredDirectory
    val supervisedTaskProcessedDir = supervisedTaskDir.resolve("processed")
    /**
     * The directory to store the results of supervised tasks.
     * */
    @RequiredDirectory
    val supervisedTaskResultBaseDir = supervisedTaskDir.resolve("result")

    ////////////////////////////////////////////////////////////////////////////
    // Unsupervised task directories

    /**
     * The directory to store unsupervised tasks.
     * A program may generate unsupervised tasks in this directory and another program may process the tasks.
     * */
    @RequiredDirectory
    val unsupervisedTaskDir: Path = taskBaseDir.resolve("unsupervised")

    /**
     * The directory to store unsupervised tasks that are being processed.
     * */
    @RequiredDirectory
    val unsupervisedTaskProcessingDir = unsupervisedTaskDir.resolve("processing")
    /**
     * The directory to store unsupervised tasks that have been processed.
     * */
    @RequiredDirectory
    val unsupervisedTaskProcessedDir = unsupervisedTaskDir.resolve("processed")
    /**
     * The directory to store the results of unsupervised tasks.
     * */
    @RequiredDirectory
    val unsupervisedTaskResultBaseDir = unsupervisedTaskDir.resolve("result")

    ////////////////////////////////////////////////////////////////////////////
    // Prompt task directories

    /**
     * The directory to store prompts.
     * A program may generate prompts in this directory and another program may process the prompts.
     * */
    @RequiredDirectory
    val promptTaskBaseDir = taskBaseDir.resolve("prompts")
    /**
     * The directory to store prompts that are being processed.
     * */
    @RequiredDirectory
    val promptTaskProcessingDir = promptTaskBaseDir.resolve("processing")
    /**
     * The directory to store prompts that have been processed.
     * */
    @RequiredDirectory
    val promptTaskProcessedDir = promptTaskBaseDir.resolve("processed")
    /**
     * The directory to store the results of prompt processing.
     * */
    @RequiredDirectory
    val promptTaskResultBaseDir = promptTaskBaseDir.resolve("result")
    /**
     * The directory to cache prompt results. It is located below `AppPaths.CACHE_DIR`,
     * not below [promptTaskResultBaseDir].
     * */
    @RequiredDirectory
//    val promptTaskResultCacheDir = promptTaskResultBaseDir.resolve("cache")
    val promptTaskResultCacheDir = AppPaths.CACHE_DIR.resolve("prompts")
    /**
     * The pattern of prompt file names.
     * A typical file name: `prompt.p1723201624.0.remarkable.txt`.
     * - p1723201624: the project id
     * - 0: the prediction in the clustering result
     * - remarkable: a label for the prompt, it's optional
     * - txt: the file extension
     *
     * Can be parsed into a PromptFile object.
     *
     * ```
     * data class PromptFile(
     *   val clusteringProjectId: String,
     *   val prediction: String,
     *   val label: String,
     *   val extension: String
     * )
     * ```
     *
     * Explanation:
     * - `prompt\.`: The literal prefix `prompt.` indicating a prompt file.
     * - `(p\d+)`: Captures the clustering task ID, starting with 'p' followed by digits.
     * - `(\d+)`: Captures the cluster ID, consisting of one or more digits.
     * - `\.?(\w+)?`: An optional label, separated by a dot.
     * - `([a-zA-Z]+)`: Captures the file extension, consisting of letters only.
     * */
    val promptFileNamePattern = "prompt\\.(p\\d+)\\.(\\d+)\\.?(\\w+)?\\.([a-zA-Z]+)".toRegex()
    /**
     * The pattern of prompt answer file names.
     * The answer is generated by the LLM.
     * A typical file name: `prompt.p1723201624.0.excellent.answer.json`.
     * - p1723201624: the project id
     * - 0: the prediction in the clustering result
     * - excellent: a label for the answer, it's optional
     * - json: the file extension
     *
     * Can be parsed into a AnswerFile object.
     *
     * ```
     * data class AnswerFile(
     *   val clusteringProjectId: String,
     *   val prediction: String,
     *   val label: String,
     *   val extension: String
     * )
     * ```
     *
     * Explanation:
     * - `prompt\.`: The literal prefix `prompt.` indicating a prompt file.
     * - `(p\d+)`: Captures the clustering task ID, starting with 'p' followed by digits.
     * - `(\d+)`: Captures the cluster ID, consisting of one or more digits.
     * - `\.?(\w+)?`: An optional label, separated by a dot.
     * - `\.answer\.`: The literal marker `.answer.` indicating an answer file.
     * - `([a-zA-Z]+)`: Captures the file extension, consisting of letters only.
     * */
    val answerFileNamePattern = "prompt\\.(p\\d+)\\.(\\d+)\\.?(\\w+)?\\.answer\\.([a-zA-Z]+)".toRegex()

    ////////////////////////////////////////////////////////////////////////////
    // Report directories

    /**
     * The directory to store reports.
     * A program may generate reports in this directory and another program may process the reports.
     * */
    @RequiredDirectory
    val reportBaseDir = taskBaseDir.resolve("report")
    /**
     * The directory to store reports that are being processed.
     * */
    @RequiredDirectory
    val reportProcessingDir = reportBaseDir.resolve("processing")
    /**
     * The directory to store reports that have been processed.
     * */
    @RequiredDirectory
    val reportProcessedDir = reportBaseDir.resolve("processed")
    /**
     * The directory to store the result of report processing.
     * */
    @RequiredDirectory
    val reportResultDir = reportBaseDir.resolve("result")
    /**
     * The pattern of report file names.
     *
     * The file name should have an extension.
     * */
    val reportFileNamePattern = ".+\\..+".toRegex()

    // Must stay below all property declarations: the directories can only be created
    // after the path properties have been initialized.
    init {
        createDirectories()
    }

    /**
     * Create the directories of all [Path] properties annotated with [RequiredDirectory]
     * if they do not exist.
     */
    fun createDirectories() {
        ExoticMLPaths::class.java.declaredFields
            .filter { field -> field.annotations.any { it is RequiredDirectory } }
            // The fields are declared by this object, so read them from this object.
            .mapNotNull { field -> field.get(this) as? Path }
            .forEach { dir ->
                if (!Files.exists(dir)) {
                    Files.createDirectories(dir)
                }
            }
    }

    /**
     * Copy [taskFile] into [unsupervisedTaskDir], keeping its file name and replacing an existing file.
     */
    fun copyToLearnUnsupervised(taskFile: Path) {
        val path = unsupervisedTaskDir.resolve(taskFile.fileName)
        Files.copy(taskFile, path, StandardCopyOption.REPLACE_EXISTING)
    }

    /**
     * Copy [promptFile] into [promptTaskBaseDir], keeping its file name and replacing an existing file.
     */
    fun copyToPrompt(promptFile: Path) {
        val path = promptTaskBaseDir.resolve(promptFile.fileName)
        Files.copy(promptFile, path, StandardCopyOption.REPLACE_EXISTING)
    }

    /**
     * Copy [reportFile] into [reportBaseDir], keeping its file name and replacing an existing file.
     */
    fun copyToReport(reportFile: Path) {
        val path = reportBaseDir.resolve(reportFile.fileName)
        Files.copy(reportFile, path, StandardCopyOption.REPLACE_EXISTING)
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.