Skip to content

Commit

Permalink
fix: do not process js comments (sourcerer-io#165) (sourcerer-io#196)
Browse files Browse the repository at this point in the history
* fix: do not process js comments (sourcerer-io#165)

* wip: fix pr

* fix: style
  • Loading branch information
yaronskaya authored and anatolystansler committed Feb 16, 2018
1 parent affed37 commit aea06a4
Show file tree
Hide file tree
Showing 14 changed files with 105 additions and 70 deletions.
9 changes: 5 additions & 4 deletions src/main/kotlin/app/extractors/CExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ class CExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// tokenize() blanks out whatever this matches. The pattern is anchored at
// the start of the input and runs to the end of the line, so a line that
// contains `#include` followed by whitespace is removed entirely.
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
// Same shape for `//`: the match begins at the start of the line, so code
// that precedes a trailing comment is removed together with the comment.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
// Used by extractImports(). Group 1 is the first path segment of the
// included header, e.g. `foo` in `#include <foo/bar.h>`. The `\.\w+` part
// is mandatory, so headers without a file extension do not match.
val extractImportRegex =
Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -24,9 +28,8 @@ class CExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val regex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
fileContent.forEach {
val res = regex.find(it)
val res = extractImportRegex.find(it)
if (res != null) {
val lineLib = res.groupValues.last()
imports.add(lineLib)
Expand All @@ -37,8 +40,6 @@ class CExtractor : ExtractorInterface {
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
return super.tokenize(newLine)
Expand Down
9 changes: 6 additions & 3 deletions src/main/kotlin/app/extractors/CSharpExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ class CSharpExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// Matches from the start of the input through a `using Dotted.Name`
// directive. NOTE(review): no use of this pattern is visible in this part
// of the file - confirm where it is applied.
val importRegex = Regex("""^.*using\s+(\w+[.\w+]*)""")
// Matches from the start of the input to the end of the line when the line
// contains `//`. NOTE(review): its call site is not visible here either.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
// Used by extractImports(); group 1 is the dotted name that follows
// `using`.
val extractImportRegex = Regex("""using\s+(\w+[.\w+]*)""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -25,9 +28,8 @@ class CSharpExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val regex = Regex("""using\s+(\w+[.\w+]*)""")
fileContent.forEach {
val res = regex.find(it)
val res = extractImportRegex.find(it)
if (res != null) {
val importedName = res.groupValues[1]
LIBRARIES.forEach { library ->
Expand All @@ -52,6 +54,7 @@ class CSharpExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
11 changes: 6 additions & 5 deletions src/main/kotlin/app/extractors/CppExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ class CppExtractor : ExtractorInterface {
}
// Value returned by getMultipleImportsToLibraryMap for LANGUAGE_NAME.
// NOTE(review): presumably maps several header names onto one library -
// its shape and its use are not visible here, confirm before relying on it.
val MULTI_IMPORT_TO_LIB =
ExtractorInterface.getMultipleImportsToLibraryMap(LANGUAGE_NAME)
// tokenize() blanks out whatever this matches. The pattern is anchored at
// the start of the input and runs to the end of the line, so a line that
// contains `#include` followed by whitespace is removed entirely.
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
// Same shape for `//`: the match begins at the start of the line, so code
// that precedes a trailing comment is removed together with the comment.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
// Used by extractImports(). Group 1 is the first path segment of the
// included header. The `\.\w+` part is mandatory, so extension-less
// headers such as `<vector>` do not match.
val extractImportRegex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -26,9 +29,8 @@ class CppExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val regex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
fileContent.forEach {
val res = regex.find(it)
val res = extractImportRegex.find(it)
if (res != null) {
val lineLib = res.groupValues.last()
imports.add(lineLib)
Expand All @@ -40,8 +42,6 @@ class CppExtractor : ExtractorInterface {
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
return super.tokenize(newLine)
Expand All @@ -50,6 +50,7 @@ class CppExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
6 changes: 3 additions & 3 deletions src/main/kotlin/app/extractors/ExtractorInterface.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ interface ExtractorInterface {
// Classifier instances stored under a String key.
// NOTE(review): presumably keyed by language name - confirm against the
// code that fills the map, which is not visible here.
private val classifiersCache = hashMapOf<String, Classifier>()
// NOTE(review): looks like the directory name and file extension of the
// stored classifier models - confirm against the loading code.
private val modelsDir = "models"
private val pbExt = ".pb"
// Non-empty single- or double-quoted string literals (lazy match);
// tokenize() removes them before splitting.
val stringRegex = Regex("""(".+?"|'.+?')""")
// Delimiters tokenize() splits on: whitespace plus punctuation and
// operator characters. `!=` is listed before `!`, so it is consumed as a
// single delimiter.
val splitRegex =
Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")

private fun getResource(path: String): InputStream {
return ExtractorInterface::class.java.classLoader
Expand Down Expand Up @@ -173,11 +176,8 @@ interface ExtractorInterface {
}

fun tokenize(line: String): List<String> {
val stringRegex = Regex("""(".+?"|'.+?')""")
val newLine = stringRegex.replace(line, "")
//TODO(lyaronskaya): multiline comment regex
val splitRegex =
Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
val tokens = splitRegex.split(newLine)
.filter { it.isNotBlank() && !it.contains('"') && !it.contains('\'')
&& it != "-" && it != "@"}
Expand Down
16 changes: 9 additions & 7 deletions src/main/kotlin/app/extractors/GoExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ class GoExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// tokenize() blanks out whatever this matches. The pattern is anchored at
// the start of the input and runs to the end of the line, so a line that
// contains `import` followed by whitespace is removed entirely.
val importRegex = Regex("""^(.*import)\s[^\n]*""")
// Same shape for `//`: the match begins at the start of the line, so code
// that precedes a trailing comment is removed together with the comment.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
// Single-line form `import "pkg"`. The quoted path must consist of word
// characters only, so paths containing `/` or `.` do not match here.
val singleImportRegex = Regex("""import\s+"(\w+)"""")
// Grouped form `import ( ... )`; group 1 is the text between the
// parentheses. DOT_MATCHES_ALL lets `.` cross line breaks.
val multipleImportRegex = Regex("""import[\s\t\n]+\((.+?)\)""",
RegexOption.DOT_MATCHES_ALL)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -24,31 +29,27 @@ class GoExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val singleImportRegex = Regex("""import\s+"(\w+)"""")
fileContent.forEach {
val res = singleImportRegex.find(it)
if (res != null) {
val lineLib = res.groupValues.last()
imports.add(lineLib)
}
}
val multipleImportRegex = Regex("""import[\s\t\n]+\((.+?)\)""",
RegexOption.DOT_MATCHES_ALL)
val contentJoined = fileContent.joinToString(separator = "")
multipleImportRegex.findAll(contentJoined).forEach { matchResult ->
imports.addAll(matchResult.groupValues.last()
.split(Regex("""(\t+|\n+|\s+|")"""))
.filter { it.isNotEmpty() }
.map { it -> it.replace("\"", "") }
.map { it -> if (it.contains("github.com")) it.split("/")[2] else it})
.map { it -> if (it.contains("github.com")) it.split("/")[2]
else it})
}

return imports.toList()
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^(.*import)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
return super.tokenize(newLine)
Expand All @@ -57,6 +58,7 @@ class GoExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
13 changes: 7 additions & 6 deletions src/main/kotlin/app/extractors/JavaExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ class JavaExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// The next three patterns are blanked out by tokenize(). Each is anchored
// at the start of the input and runs to the end of the line, so a matching
// line is removed entirely:
//  - importRegex: the line contains `import` followed by whitespace;
//  - commentRegex: the line contains `//` (code before it is dropped too);
//  - packageRegex: the line contains `package` followed by whitespace.
val importRegex = Regex("""^(.*import)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
// Used by extractImports(); group 1 is the dotted name that follows
// `import`.
val extractImportRegex = Regex("""import\s+(\w+[.\w+]*)""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand Down Expand Up @@ -60,9 +64,8 @@ class JavaExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val regex = Regex("""import\s+(\w+[.\w+]*)""")
fileContent.forEach {
val res = regex.find(it)
val res = extractImportRegex.find(it)
if (res != null) {
val importedName = res.groupValues[1]
LIBRARIES.forEach { library ->
Expand All @@ -77,9 +80,6 @@ class JavaExtractor : ExtractorInterface {
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^(.*import)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
newLine = packageRegex.replace(newLine, "")
Expand All @@ -89,6 +89,7 @@ class JavaExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
23 changes: 16 additions & 7 deletions src/main/kotlin/app/extractors/JavascriptExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ class JavascriptExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// Delimiters extractImports() splits the file text on: runs of whitespace
// plus punctuation and operator characters.
val splitRegex =
Regex("""\s+|,|;|:|\*|\n|\(|\)|\[|]|\{|}|\+|=|\.|>|<|#|@|\$""")
// `/* ... */` comments with at least one character between the markers.
// `.` does not cross line breaks by default; extractImports() applies this
// to the lines joined with spaces, so a comment spanning lines still fits.
val multilineCommentRegex = Regex("""/\*.+?\*/""")
// Quoted strings that contain whitespace, i.e. more than one word;
// extractImports() removes them before splitting.
val twoOrMoreWordsRegex = Regex("""(".+?\s.+?"|'.+?\s.+?')""")

// Anchored at the start of the input and running to the end of the line: a
// line that contains `//` is matched in full, including any code before
// the comment marker.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -23,13 +29,11 @@ class JavascriptExtractor : ExtractorInterface {
}

override fun extractImports(fileContent: List<String>): List<String> {
val splitRegex =
Regex("""\s+|,|;|:|\*|\n|\(|\)|\\[|]|\{|}|\+|=|\.|>|<|#|@|\$""")
val twoOrMoreWordsRegex = Regex("""(".+?\s.+?"|'.+?\s.+?')""")

val line = fileContent.joinToString(separator = " ").toLowerCase()
val fileTokens = twoOrMoreWordsRegex.replace(line, "").split(splitRegex)

val line = fileContent.map { line -> commentRegex.replace(line, "")}
.joinToString(separator = " ").toLowerCase()
val fileTokens = multilineCommentRegex.replace(
twoOrMoreWordsRegex.replace(line, ""), "")
.split(splitRegex)
return fileTokens.filter { token -> token in LIBRARIES }.distinct()
}

Expand All @@ -38,4 +42,9 @@ class JavascriptExtractor : ExtractorInterface {
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}

/**
 * Tokenizes [line] after removing what the companion object's
 * `commentRegex` matches.
 *
 * That pattern is anchored at the start of the input and runs to the end of
 * the line, so a line containing `//` is blanked out completely, including
 * any code in front of the comment marker.
 * NOTE(review): this also affects lines that merely contain `//` inside a
 * string, such as a URL - confirm that is acceptable.
 *
 * The shared companion regex is used rather than compiling an identical
 * pattern on every call.
 *
 * @param line a single line of JavaScript source
 * @return the tokens produced by the interface's default `tokenize`
 */
override fun tokenize(line: String): List<String> =
    super.tokenize(commentRegex.replace(line, ""))
}
13 changes: 7 additions & 6 deletions src/main/kotlin/app/extractors/KotlinExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ class KotlinExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// The next three patterns are blanked out by tokenize(). Each is anchored
// at the start of the input and runs to the end of the line, so a matching
// line is removed entirely:
//  - importRegex: the line contains `import` followed by whitespace;
//  - commentRegex: the line contains `//` (code before it is dropped too);
//  - packageRegex: the line contains `package` followed by whitespace.
val importRegex = Regex("""^(.*import)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
// Used by extractImports(); group 1 is the dotted name that follows
// `import`.
val extractImportRegex = Regex("""import\s+(\w+[.\w+]*)""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -24,9 +28,8 @@ class KotlinExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val regex = Regex("""import\s+(\w+[.\w+]*)""")
fileContent.forEach {
val res = regex.find(it)
val res = extractImportRegex.find(it)
if (res != null) {
val importedName = res.groupValues[1]
LIBRARIES.forEach { library ->
Expand All @@ -41,9 +44,6 @@ class KotlinExtractor : ExtractorInterface {
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^(.*import)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
newLine = packageRegex.replace(newLine, "")
Expand All @@ -53,6 +53,7 @@ class KotlinExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
14 changes: 7 additions & 7 deletions src/main/kotlin/app/extractors/ObjectiveCExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ class ObjectiveCExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// tokenize() blanks out whatever this matches. The pattern is anchored at
// the start of the input and runs to the end of the line, so a line that
// contains `#import`, `#include`, `@import` or `@include` followed by
// whitespace is removed entirely.
val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
// Same shape for `//`: the match begins at the start of the line, so code
// that precedes a trailing comment is removed together with the comment.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
// `#import` / `#include` of a header with a file extension. Group 1 is the
// directive name, group 2 the first path segment of the header, e.g.
// `UIKit` in `#import <UIKit/UIKit.h>`.
// The opening delimiter class is `["<]`: it used to be `[">]`, which
// rejected every angle-bracket import and accepted only quoted ones.
val sharpImportIncludeRegex =
    Regex("""#(import|include)\s+["<](\w+)[/\w+]*\.\w+[">]""")
// Module import `@import Name`; group 1 is the module name.
val atImportRegex = Regex("""@import\s+(\w+)""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -24,10 +29,6 @@ class ObjectiveCExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val sharpImportIncludeRegex =
Regex("""#(import|include)\s+[">](\w+)[/\w+]*\.\w+[">]""")
val atImportRegex = Regex("""@import\s+(\w+)""")

fileContent.forEach {
val res = sharpImportIncludeRegex.findAll(it) +
atImportRegex.findAll(it)
Expand All @@ -41,8 +42,6 @@ class ObjectiveCExtractor : ExtractorInterface {
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
return super.tokenize(newLine)
Expand All @@ -51,6 +50,7 @@ class ObjectiveCExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
13 changes: 7 additions & 6 deletions src/main/kotlin/app/extractors/PhpExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ class PhpExtractor : ExtractorInterface {
// Library classifier for LANGUAGE_NAME; `by lazy` defers the lookup until
// the property is first read.
val evaluator by lazy {
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
}
// tokenize() blanks out whatever this matches, up to the end of the line.
// NOTE(review): the `.*` prefix belongs to the `require` alternative only;
// `require_once`, `include`, `include_once` and `use` match only when they
// are the first characters of the input, so indented statements of those
// kinds are not stripped. Confirm whether that asymmetry is intended.
val importRegex =
    Regex("""^(.*require|require_once|include|include_once|use)\s[^\n]*""")
// Anchored at the start of the input and running to the end of the line: a
// line that contains `//` is matched in full, including code before it.
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
// `use Vendor\Package\...`; group 1 is the first namespace segment.
val useRegex = Regex("""use\s+(\w+)[\\\w+]*""")
// require / require_once / include / include_once of a single-quoted file
// name, with optional parentheses. Group 1 is the keyword, group 2 the file
// name up to its first dot.
// The second fragment used to open with a fourth quote character, which put
// a literal `"` in front of `include_once` and made that form unmatchable.
val requireIncludeRegex = Regex("""(require|require_once|include|""" +
    """include_once)\s*[(]?'(\w+)[.\w+]*'[)]?""")
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -24,9 +29,6 @@ class PhpExtractor : ExtractorInterface {
override fun extractImports(fileContent: List<String>): List<String> {
val imports = mutableSetOf<String>()

val useRegex = Regex("""use\s+(\w+)[\\\w+]*""")
val requireIncludeRegex = Regex("""(require|require_once|include|""" +
""""include_once)\s*[(]?'(\w+)[.\w+]*'[)]?""")
fileContent.forEach {
val res = useRegex.findAll(it) + requireIncludeRegex.findAll(it)
if (res.toList().isNotEmpty()) {
Expand All @@ -39,8 +41,6 @@ class PhpExtractor : ExtractorInterface {
}

override fun tokenize(line: String): List<String> {
val importRegex = Regex("""^(.*require|require_once|include|include_once|use)\s[^\n]*""")
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
var newLine = importRegex.replace(line, "")
newLine = commentRegex.replace(newLine, "")
return super.tokenize(newLine)
Expand All @@ -49,6 +49,7 @@ class PhpExtractor : ExtractorInterface {
override fun getLineLibraries(line: String,
fileLibraries: List<String>): List<String> {

return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
return super.getLineLibraries(line, fileLibraries, evaluator,
LANGUAGE_NAME)
}
}
Loading

0 comments on commit aea06a4

Please sign in to comment.