Updating co-routines work, bug fixes in url parser.
brianmadden committed May 8, 2017
1 parent 62dc370 commit 73c073e
Showing 6 changed files with 51 additions and 20 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,5 +1,5 @@
-[![Release](https://jitpack.io/v/brianmadden/krawler.svg)]
-(https://jitpack.io/#brianmadden/krawler) [![Build Status](https://travis-ci.org/brianmadden/krawler.svg?branch=master)](https://travis-ci.org/brianmadden/krawler)
+[![Release](https://jitpack.io/v/brianmadden/krawler.svg)](https://jitpack.io/#brianmadden/krawler)
+[![Build Status](https://travis-ci.org/brianmadden/krawler.svg?branch=master)](https://travis-ci.org/brianmadden/krawler)
 
 About
 =====
7 changes: 3 additions & 4 deletions build.gradle
@@ -1,8 +1,8 @@
 group 'io.thelandscape'
-version '0.2.2'
+version '0.3.3'
 
 buildscript {
-    ext.kotlin_version = '1.1.0'
+    ext.kotlin_version = '1.1.2-2'
 
     repositories {
         mavenCentral()
@@ -46,8 +46,7 @@ allprojects {
 project(":") {
     dependencies {
         compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
-        compile "org.jetbrains.kotlin:kotlin-reflect:$kotlin_version"
-        compile 'org.jetbrains.kotlinx:kotlinx-coroutines-core:0.12'
+        compile 'org.jetbrains.kotlinx:kotlinx-coroutines-core:0.15'
 
         compile "org.apache.httpcomponents:httpclient:4.5.2"
         compile group: 'org.hsqldb', name: 'hsqldb', version: '2.3.4'
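Note: the dependency changes track the coroutine rework in Krawler.kt below. Kotlin moves to 1.1.2-2, the kotlin-reflect dependency is dropped, and kotlinx-coroutines-core jumps from 0.12 to 0.15. The new crawl loop leans on the parent-child Job idiom from that library's experimental API; a minimal, self-contained sketch of the idiom against the 0.15-era API (worker bodies and counts are illustrative, not Krawler's):

import kotlinx.coroutines.experimental.*

fun main(args: Array<String>) = runBlocking {
    val job = Job()   // parent handle shared by every worker

    // Launching into (CommonPool + job) parents each coroutine to `job`,
    // so a single cancel() tears the whole group down.
    val workers = (1..4).map { id ->
        launch(CommonPool + job) {
            delay(50)
            println("worker $id finished")
        }
    }

    workers.forEach { it.join() }   // wait for the children
    job.cancel()                    // a bare Job() only completes when cancelled
    job.join()                      // now returns immediately
}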
2 changes: 1 addition & 1 deletion example/src/main/kotlin/main.kt
@@ -20,7 +20,7 @@ import io.thelandscape.krawler.crawler.KrawlConfig
 
 fun main(args: Array<String>) {
 
-    val config: KrawlConfig = KrawlConfig(totalPages = 100)
+    val config: KrawlConfig = KrawlConfig(totalPages = 1000)
     val k = SimpleExample(config)
 
     // Add a few different hosts to the whitelist
4 changes: 2 additions & 2 deletions gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
-#Sat Jan 21 23:35:44 PST 2017
+#Sun Mar 05 13:10:52 PST 2017
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-2.13-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-2.13-bin.zip
51 changes: 41 additions & 10 deletions src/main/kotlin/io/thelandscape/krawler/crawler/Krawler.kt
@@ -33,6 +33,7 @@ import io.thelandscape.krawler.robots.RobotsConfig
 import kotlinx.coroutines.experimental.*
 import org.apache.logging.log4j.LogManager
 import org.apache.logging.log4j.Logger
+import java.util.concurrent.atomic.AtomicBoolean
 import java.util.concurrent.atomic.AtomicInteger
 
 /**
@@ -68,6 +69,12 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
             }
 
         }
+
+        job.invokeOnCompletion {
+            logger.debug("Ending here... (job is no longer active)!!!!")
+            onCrawlEnd()
+        }
+
     }
 
     internal val scheduledQueue: ScheduledQueue = ScheduledQueue(krawlQueues!!, config)
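This is the core of the lifecycle change: onCrawlEnd() is no longer called inline by the blocking start loop (next hunk) but is registered as a completion handler on the parent Job, so it runs exactly once, whenever the job completes or is cancelled. A minimal sketch of the same hook against the kotlinx.coroutines.experimental API of this era (the println bodies are illustrative):

import kotlinx.coroutines.experimental.*

fun main(args: Array<String>) = runBlocking {
    val job = Job()

    // Fires once when the job finishes, normally or via cancel().
    job.invokeOnCompletion { println("crawl ended, running cleanup") }

    launch(CommonPool + job) { delay(50) }
    delay(100)      // let the worker finish
    job.cancel()    // triggers the handler above
    job.join()
}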
@@ -190,9 +197,8 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
         }
 
         onCrawlStart()
-        (1..100).map { launch(CommonPool + job) { doCrawl() } }
-        while(job.isActive) { delay(250) }
-        onCrawlEnd()
+        (1..100).map { schedule() }
+        job.join()
     }
 
     /**
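Note the shape of the new blocking start: a bare Job() never completes on its own, even after all of its children finish, so job.join() here suspends until something cancels the job, which in this commit is the page-limit check in doCrawl (further down). That makes both the old while(job.isActive) { delay(250) } poll and the inline onCrawlEnd() call unnecessary, since the completion handler registered in init now owns the cleanup.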
@@ -220,7 +226,7 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
         }
 
         onCrawlStart()
-        (1..100).map { launch(CommonPool + job) { doCrawl() } }
+        (1..100).map { schedule() }
     }


@@ -242,8 +248,18 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
     /**
      * Private members
      */
+
+    internal fun schedule() = launch(CommonPool + job) {
+        try { doCrawl() }
+        catch(e: Throwable) {
+            logger.debug(e.printStackTrace())
+            visitCount.decrementAndGet()
+        }
+    }
+
     internal val visitCount: AtomicInteger = AtomicInteger(0)
     internal val finishedCount: AtomicInteger = AtomicInteger(0)
+    internal val wasShutdown: AtomicBoolean = AtomicBoolean(false)
 
     // Set of redirect codes
     private val redirectCodes: Set<Int> = setOf(300, 301, 302, 303, 307, 308)
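schedule() centralizes what were scattered launch(CommonPool + job) { doCrawl() } call sites and adds crash resilience: a Throwable thrown out of doCrawl() is logged and visitCount is rolled back, so a failed worker does not consume the page budget. (One quirk kept from the commit: logger.debug(e.printStackTrace()) passes the Unit returned by printStackTrace() to the logger, while the trace itself goes to stderr as a side effect.) A stand-alone sketch of the same pattern; the helper name is hypothetical, not from Krawler:

import kotlinx.coroutines.experimental.*

// Hypothetical helper mirroring schedule(): each worker catches its own
// failures so an exception never silently thins the worker pool.
fun scheduleWorker(parent: Job, work: suspend () -> Unit): Job =
    launch(CommonPool + parent) {
        try { work() }
        catch (e: Throwable) {
            e.printStackTrace()   // report and let this worker exit cleanly
        }
    }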
@@ -257,14 +273,16 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
 
         // Make sure we're within depth limit
         if (depth >= config.maxDepth && config.maxDepth != -1) {
-            launch(CommonPool + job) { doCrawl() }
+
+            logger.debug("Max depth!")
             return
         }
 
         val history: KrawlHistoryEntry =
             if (krawlHistory!!.hasBeenSeen(krawlUrl)) { // If it has been seen
                 onRepeatVisit(krawlUrl, parent)
-                launch(CommonPool + job) { doCrawl() }
+                schedule()
+                logger.debug("History says no")
                 return
             } else {
                 krawlHistory!!.insert(krawlUrl)
@@ -277,24 +295,31 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
         if (visit || check) {
             // If we're respecting robots.txt check if it's ok to visit this page
             if (config.respectRobotsTxt && !minder.isSafeToVisit(krawlUrl)) {
-                launch(CommonPool + job) { doCrawl() }
+                schedule()
+                logger.debug("Robots says no")
                 return
             }
 
-            if ((visitCount.incrementAndGet() > config.totalPages) && (config.totalPages > -1))
+            if ((visitCount.incrementAndGet() > config.totalPages) && (config.totalPages > -1)) {
+                logger.debug("Max visit limit reached")
                 return
+            }
 
             val doc: RequestResponse = requestProvider.getUrl(krawlUrl)
 
             // If there was an error on trying to get the doc, call content fetch error
             if (doc is ErrorResponse) {
                 onContentFetchError(krawlUrl, doc.reason)
+                schedule()
+                logger.debug("Content fetch error!")
                 return
             }
 
             // If there was an error parsing the response, still a content fetch error
             if (doc !is KrawlDocument) {
                 onContentFetchError(krawlUrl, "Krawler was unable to parse the response from the server.")
+                schedule()
+                logger.debug("content fetch error2")
                 return
             }
 
@@ -309,12 +334,18 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
             check(krawlUrl, doc.statusCode)
 
             if ((finishedCount.incrementAndGet() == config.totalPages) && (config.totalPages > -1)) {
-                job.cancel()
+                logger.debug("cancelling jobs")
+                if (!wasShutdown.getAndSet(true))
+                    try {
+                        job.cancel()
+                    } catch (e: CancellationException) {
+                        // Do nothing
+                    }
                 return
             }
         }
 
-        launch(CommonPool + job) { doCrawl() }
+        schedule()
     }
 
     /**
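Shutdown is now guarded twice: wasShutdown.getAndSet(true) lets only the first worker that trips the page limit issue the job.cancel(), and the surrounding try/catch absorbs a CancellationException if the cancelling coroutine is itself being torn down at that moment. A compact, runnable illustration of the get-and-set idiom; the class and names are hypothetical:

import java.util.concurrent.atomic.AtomicBoolean

class ShutdownOnce {
    private val wasShutdown = AtomicBoolean(false)

    // getAndSet(true) returns the previous value, so exactly one caller
    // sees `false` and runs the action; every later caller skips it.
    fun shutdown(action: () -> Unit) {
        if (!wasShutdown.getAndSet(true)) action()
    }
}

fun main(args: Array<String>) {
    val once = ShutdownOnce()
    repeat(3) { once.shutdown { println("cancelling jobs") } }  // prints once
}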
3 changes: 2 additions & 1 deletion src/main/kotlin/io/thelandscape/krawler/http/KrawlUrl.kt
@@ -206,7 +206,8 @@ class KrawlUrl private constructor(url: String, parent: KrawlUrl?) {
                 idx++
                 continue
             } else if (idx + 2 >= path.length) {
-                path.slice(idx + 1..path.length)
+                idx += 2
+                continue
             } else {
                 path.slice(idx + 1..idx + 2)
             }
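The url-parser fix: in the dot-segment loop, when the cursor sits within two characters of the end of the path, the old branch evaluated path.slice(idx + 1..path.length), an inclusive range whose upper bound is one past the last valid index (path.length - 1), so a dot segment at the very end of a path could throw StringIndexOutOfBoundsException. The new branch simply advances the cursor and re-enters the loop. For context, a simplified, stand-alone remove-dot-segments pass in the spirit of RFC 3986 section 5.2.4 (an illustration only, not Krawler's implementation; it ignores trailing-slash subtleties):

fun removeDotSegments(path: String): String {
    val out = mutableListOf<String>()
    for (seg in path.split("/")) {
        when (seg) {
            "", "." -> { /* skip empty and current-directory segments */ }
            ".." -> if (out.isNotEmpty()) out.removeAt(out.size - 1)  // pop parent
            else -> out.add(seg)
        }
    }
    return "/" + out.joinToString("/")
}

fun main(args: Array<String>) {
    println(removeDotSegments("/a/b/./../c"))  // -> /a/c
    println(removeDotSegments("/a/."))         // trailing dot segment -> /a
}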
