Skip to content

Commit

Permalink
Merge pull request #227 from Yanrishatum/query_fixup_0
Browse files Browse the repository at this point in the history
Add support for various translators
  • Loading branch information
gmathi authored Jan 23, 2022
2 parents 52ed490 + 1b4038a commit 04b8460
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,11 @@ class GenericSelectorQueryCleaner(
applyCommands(subQuery, elements)
return@forEachIndexed
}
// SubQueryRole.RPage -> {}
SubqueryRole.RPage, SubqueryRole.RRealChapter, SubqueryRole.RChapterLink -> {
applyCommands(subQuery, elements)
elements.attr("data-role", subQuery.role.toString())
return@forEachIndexed
}
else -> {
}
}
Expand Down
113 changes: 111 additions & 2 deletions app/src/main/java/io/github/gmathi/novellibrary/cleaner/HtmlCleaner.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package io.github.gmathi.novellibrary.cleaner
import android.graphics.Bitmap
import android.graphics.Color
import android.net.Uri
import android.util.Log
import android.webkit.URLUtil
import androidx.core.graphics.alpha
import androidx.core.graphics.blue
Expand Down Expand Up @@ -86,6 +87,111 @@ open class HtmlCleaner protected constructor() {
}
""".trimIndent()
),
SelectorQuery(
".content-area", host = "a-t.nu", subQueries = listOf(
SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false),
SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = false, multiple = false),
SelectorSubQuery(".manga-discussion", SubqueryRole.RComments, optional = true, multiple = false),
// The inline <style> tags carry CSS that renders text through pseudo-elements, so they are whitelisted.
SelectorSubQuery(".reading-content style", SubqueryRole.RWhitelist, optional = false, multiple = true),
SelectorSubQuery(".wp-community-credits", SubqueryRole.RBlacklist, optional = true, multiple = true),
), keepContentClasses = true, customCSS = """
*,*::before,*::after {
user-select: initial !important;
top: initial!important;
bottom: initial!important;
left: initial!important;
right: initial!important;
}
""".trimIndent()
),

// Make lazytranslations more bearable, ref -> https://lazytranslations.com/tl/oc/oc1/
SelectorQuery(".elementor-inner", host="lazytranslations.com", subQueries = listOf(
SelectorSubQuery(".entry-header h1.entry-title", SubqueryRole.RHeader, optional = false, multiple = false),
SelectorSubQuery("#innerbody,.elementor-text-editor", SubqueryRole.RContent, optional = false, multiple = false),
// The navigation section is matched by position (third child section) rather than by a dedicated class or id.
SelectorSubQuery(".elementor-inner>.elementor-section:nth-child(3)", SubqueryRole.RNavigation, optional = true, multiple = false),
SelectorSubQuery("#innerbody>div>p>span[style*='color: #ffffff'],.elementor-text-editor div>p>span[style*='color: #ffffff'],.lazyt-announcement", SubqueryRole.RBlacklist, optional = true, multiple = true)
)),
SelectorQuery(".post-content", host="lazytranslations.com", subQueries = listOf(
SelectorSubQuery(".entry-header h1.entry-title", SubqueryRole.RHeader, optional = false, multiple = false),
SelectorSubQuery(".entry-content", SubqueryRole.RContent, optional = false, multiple = false),
SelectorSubQuery(".lazyt-announcement", SubqueryRole.RBlacklist, optional = true, multiple = true),
SelectorSubQuery(".post-content figure.wp-block-image>a", SubqueryRole.RRealChapter, optional = true, multiple = false),
SelectorSubQuery(".post-content figure.wp-block-image>a>img", SubqueryRole.RProcess, optional = true, multiple = false,
extraProcessing = listOf(
SubQueryProcessingCommandInfo(SubQueryProcessingCommand.AddAttribute, "alt=my image")
)
),
)),

// Scrambled fonts
SelectorQuery("div.entry-content", host = "secondlifetranslations.com", subQueries = listOf(
SelectorSubQuery(".entry-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false),
SelectorSubQuery("div.entry-content", SubqueryRole.RContent, optional = true, multiple = false),
), keepContentClasses = true, customCSS = """
@font-face {
font-family: 'open_sansscrambled';
src: url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.eot');
src: url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.eot?#iefix') format('embedded-opentype'),
url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.woff2') format('woff2'),
url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.woff') format('woff'),
url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.ttf') format('truetype'),
url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.svg#open_sansscrambled') format('svg');
font-weight: normal;
font-style: normal;
}
span.scrmbl {
font-family: 'open_sansscrambled' !important;
}
span.scrmbl .scrmbl-ent {
font-family: "Open Sans", sans-serif !important;
}
.scrmbl-ent {
visibility:hidden;
}
.scrmbl-disclaimer {
color: transparent;
height:1px;
margin:0;
padding:0;
overflow:hidden;
}
""".trimIndent()),

SelectorQuery(".reading-content", host="dragontea.ink", subQueries = listOf(
SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false),
SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = true, multiple = false),
), customCSS = """
@font-face {
font-family: 'DragonTea';
src: url(https://dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.eot);
src: url(https://dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.eot?#iefix) format('embedded-opentype'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.woff2) format('woff2'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.woff) format('woff'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.ttf) format('truetype'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.svg#DragonTea-Regular) format('svg');
font-weight: normal;
font-style: normal;
font-display: swap!important;
}
div[data-role=RContent] {
font-family: 'DragonTea'!important;
}
""".trimIndent()),

// Chapters on this site are split across two pages, e.g. https://tigertranslations.org/2018/08/31/jack-of-all-trades-1/
SelectorQuery(".the-content", host="tigertranslations.org", subQueries = listOf(
SelectorSubQuery("#chapter-heading,.entry-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false),
SelectorSubQuery(".the-content", SubqueryRole.RContent, optional = true, multiple = false),
SelectorSubQuery("a:containsOwn(PAGE)", SubqueryRole.RPage, optional = true, multiple = true),
SelectorSubQuery("a:containsOwn(NEXT CHAPTER)", SubqueryRole.RChapterLink, optional = true, multiple = true),
SelectorSubQuery("$genericMetaSubquery,.post-meta-container,.taxonomies", SubqueryRole.RMeta, optional = true, multiple = true),
SelectorSubQuery("$genericShareSubquery, .jp-relatedposts, #jp-relatedposts", SubqueryRole.RShare, optional = true, multiple = true),
SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false),
)),

// Github, DIY Translations as an example
SelectorQuery("div#readme", host = "github.com"),
Expand Down Expand Up @@ -199,8 +305,10 @@ open class HtmlCleaner protected constructor() {
if ((it.host == null || url.contains(it.host)) && body.select(it.selector).isNotEmpty()) {
// Check non-optional subqueries to ensure we match the correct website.
// TODO: Optimise by running all queries once and storing the results, instead of rerunning them a second time inside the cleaner.
//if (it.host != null) Log.d(TAG, "${it.host}, ${it.selector}")
if (it.subQueries.isEmpty()) true
else it.subQueries.all { sub ->
//if (it.host != null) Log.d(TAG, "${sub.selector} -> ${sub.optional} : ${body.select(sub.selector).isNotEmpty()}")
sub.optional || body.select(sub.selector).isNotEmpty()
}
} else false
Expand Down Expand Up @@ -612,7 +720,8 @@ open class HtmlCleaner protected constructor() {
val otherLinks = contentElement?.select("a[href]")
otherLinks?.forEach {
// Other Share links
if (it.hasAttr("title") && it.attr("title").contains("Click to share", true)) {
if ((it.hasAttr("title") && it.attr("title").contains("Click to share", true)) ||
it.attr("data-role") == "RChapterLink") {
return@forEach
}

Expand All @@ -632,7 +741,7 @@ open class HtmlCleaner protected constructor() {
}
val isMainContent = genericMainContentUrlText.find { cmp -> cmp.equals(text, true) } != null ||
Regex("""Chapter \d+""", RegexOption.IGNORE_CASE).containsMatchIn(text) ||
it.attr("data-role") == "RBuffer"
it.attr("data-role") == "RBuffer" || it.attr("data-role") == "RRealChapter"
links.add(LinkedPage(linkedUrl, text, isMainContent))
}
} catch (e: Exception) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ enum class SubqueryRole {
*/
RPage,

/**
* Link to the real chapter, for sites where the listed page only links to the actual chapter text.
* Links with this role are treated as main content when linked pages are collected.
*/
RRealChapter,

/**
* Link to another (previous/next/specific) chapter. Links marked as such are ignored by the chapter merge option.
*/
RChapterLink,

/**
* Matches are explicitly removed from the cleaned chapter.
*/
Expand Down
3 changes: 3 additions & 0 deletions app/src/main/java/io/github/gmathi/novellibrary/util/Utils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,9 @@ object Utils {
content.select("[data-role=\"RHeader\"]").remove()
content.select("[data-role=\"RFooter\"]").remove()
content.select("[data-role=\"RNavigation\"]").remove()
content.select("[data-role=\"RPage\"]").remove()
content.select("[data-role=\"RChapterLink\"]").remove()
content.select("[data-role=\"RRealChapter\"]").remove()
content.select("select,input,button").remove()
body.children().remove()
if (!dataCenter.ttsPreferences.stripHeader) body.append(doc.title())
Expand Down

0 comments on commit 04b8460

Please sign in to comment.