fix: do not process js comments (sourcerer-io#165) (sourcerer-io#196)

* fix: do not process js comments (sourcerer-io#165) * wip: fix pr * fix: style
lulu1546 · Feb 16, 2018 · aea06a4 · aea06a4
1 parent affed37
commit aea06a4
Show file tree

Hide file tree

Showing 14 changed files with 105 additions and 70 deletions.
diff --git a/src/main/kotlin/app/extractors/CExtractor.kt b/src/main/kotlin/app/extractors/CExtractor.kt
@@ -14,6 +14,10 @@ class CExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val extractImportRegex =
+            Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,9 +28,8 @@ class CExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val regex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
         fileContent.forEach {
-            val res = regex.find(it)
+            val res = extractImportRegex.find(it)
             if (res != null) {
                 val lineLib = res.groupValues.last()
                 imports.add(lineLib)
@@ -37,8 +40,6 @@ class CExtractor : ExtractorInterface {
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         return super.tokenize(newLine)

diff --git a/src/main/kotlin/app/extractors/CSharpExtractor.kt b/src/main/kotlin/app/extractors/CSharpExtractor.kt
@@ -15,6 +15,9 @@ class CSharpExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^.*using\s+(\w+[.\w+]*)""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val extractImportRegex = Regex("""using\s+(\w+[.\w+]*)""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -25,9 +28,8 @@ class CSharpExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val regex = Regex("""using\s+(\w+[.\w+]*)""")
         fileContent.forEach {
-            val res = regex.find(it)
+            val res = extractImportRegex.find(it)
             if (res != null) {
                 val importedName = res.groupValues[1]
                 LIBRARIES.forEach { library ->
@@ -52,6 +54,7 @@ class CSharpExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }
diff --git a/src/main/kotlin/app/extractors/CppExtractor.kt b/src/main/kotlin/app/extractors/CppExtractor.kt
@@ -16,6 +16,9 @@ class CppExtractor : ExtractorInterface {
         }
         val MULTI_IMPORT_TO_LIB =
             ExtractorInterface.getMultipleImportsToLibraryMap(LANGUAGE_NAME)
+        val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val extractImportRegex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -26,9 +29,8 @@ class CppExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val regex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
         fileContent.forEach {
-            val res = regex.find(it)
+            val res = extractImportRegex.find(it)
             if (res != null) {
                 val lineLib = res.groupValues.last()
                 imports.add(lineLib)
@@ -40,8 +42,6 @@ class CppExtractor : ExtractorInterface {
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         return super.tokenize(newLine)
@@ -50,6 +50,7 @@ class CppExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }
diff --git a/src/main/kotlin/app/extractors/ExtractorInterface.kt b/src/main/kotlin/app/extractors/ExtractorInterface.kt
@@ -20,6 +20,9 @@ interface ExtractorInterface {
         private val classifiersCache = hashMapOf<String, Classifier>()
         private val modelsDir = "models"
         private val pbExt = ".pb"
+        val stringRegex = Regex("""(".+?"|'.+?')""")
+        val splitRegex =
+                Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
 
         private fun getResource(path: String): InputStream {
             return ExtractorInterface::class.java.classLoader
@@ -173,11 +176,8 @@ interface ExtractorInterface {
     }
 
     fun tokenize(line: String): List<String> {
-        val stringRegex = Regex("""(".+?"|'.+?')""")
         val newLine = stringRegex.replace(line, "")
         //TODO(lyaronskaya): multiline comment regex
-        val splitRegex =
-            Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
         val tokens = splitRegex.split(newLine)
             .filter { it.isNotBlank() && !it.contains('"') && !it.contains('\'')
                 && it != "-" && it != "@"}

diff --git a/src/main/kotlin/app/extractors/GoExtractor.kt b/src/main/kotlin/app/extractors/GoExtractor.kt
@@ -14,6 +14,11 @@ class GoExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^(.*import)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val singleImportRegex = Regex("""import\s+"(\w+)"""")
+        val multipleImportRegex = Regex("""import[\s\t\n]+\((.+?)\)""",
+                RegexOption.DOT_MATCHES_ALL)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,31 +29,27 @@ class GoExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val singleImportRegex = Regex("""import\s+"(\w+)"""")
         fileContent.forEach {
             val res = singleImportRegex.find(it)
             if (res != null) {
                 val lineLib = res.groupValues.last()
                 imports.add(lineLib)
             }
         }
-        val multipleImportRegex = Regex("""import[\s\t\n]+\((.+?)\)""",
-                RegexOption.DOT_MATCHES_ALL)
         val contentJoined = fileContent.joinToString(separator = "")
         multipleImportRegex.findAll(contentJoined).forEach { matchResult ->
             imports.addAll(matchResult.groupValues.last()
                 .split(Regex("""(\t+|\n+|\s+|")"""))
                 .filter { it.isNotEmpty() }
                 .map { it -> it.replace("\"", "") }
-                .map { it ->  if (it.contains("github.com")) it.split("/")[2] else it})
+                .map { it ->  if (it.contains("github.com")) it.split("/")[2]
+                    else it})
         }
 
         return imports.toList()
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^(.*import)\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         return super.tokenize(newLine)
@@ -57,6 +58,7 @@ class GoExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }
diff --git a/src/main/kotlin/app/extractors/JavaExtractor.kt b/src/main/kotlin/app/extractors/JavaExtractor.kt
@@ -23,6 +23,10 @@ class JavaExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^(.*import)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val packageRegex = Regex("""^(.*package)\s[^\n]*""")
+        val extractImportRegex = Regex("""import\s+(\w+[.\w+]*)""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -60,9 +64,8 @@ class JavaExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val regex = Regex("""import\s+(\w+[.\w+]*)""")
         fileContent.forEach {
-            val res = regex.find(it)
+            val res = extractImportRegex.find(it)
             if (res != null) {
                 val importedName = res.groupValues[1]
                 LIBRARIES.forEach { library ->
@@ -77,9 +80,6 @@ class JavaExtractor : ExtractorInterface {
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^(.*import)\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
-        val packageRegex = Regex("""^(.*package)\s[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         newLine = packageRegex.replace(newLine, "")
@@ -89,6 +89,7 @@ class JavaExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }
diff --git a/src/main/kotlin/app/extractors/JavascriptExtractor.kt b/src/main/kotlin/app/extractors/JavascriptExtractor.kt
@@ -15,6 +15,12 @@ class JavascriptExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val splitRegex =
+                Regex("""\s+|,|;|:|\*|\n|\(|\)|\[|]|\{|}|\+|=|\.|>|<|#|@|\$""")
+        val multilineCommentRegex = Regex("""/\*.+?\*/""")
+        val twoOrMoreWordsRegex = Regex("""(".+?\s.+?"|'.+?\s.+?')""")
+
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -23,13 +29,11 @@ class JavascriptExtractor : ExtractorInterface {
     }
 
     override fun extractImports(fileContent: List<String>): List<String> {
-        val splitRegex =
-            Regex("""\s+|,|;|:|\*|\n|\(|\)|\\[|]|\{|}|\+|=|\.|>|<|#|@|\$""")
-        val twoOrMoreWordsRegex = Regex("""(".+?\s.+?"|'.+?\s.+?')""")
-
-        val line = fileContent.joinToString(separator = " ").toLowerCase()
-        val fileTokens = twoOrMoreWordsRegex.replace(line, "").split(splitRegex)
-
+        val line = fileContent.map { line -> commentRegex.replace(line, "")}
+                       .joinToString(separator = " ").toLowerCase()
+        val fileTokens = multilineCommentRegex.replace(
+                            twoOrMoreWordsRegex.replace(line, ""), "")
+                            .split(splitRegex)
         return fileTokens.filter { token -> token in LIBRARIES }.distinct()
     }
 
@@ -38,4 +42,9 @@ class JavascriptExtractor : ExtractorInterface {
         return super.getLineLibraries(line, fileLibraries, evaluator,
             LANGUAGE_NAME)
     }
+
+    override fun tokenize(line: String): List<String> {
+        // Strip `//` comments with the companion's precompiled commentRegex.
+        return super.tokenize(commentRegex.replace(line, ""))
+    }
 }
diff --git a/src/main/kotlin/app/extractors/KotlinExtractor.kt b/src/main/kotlin/app/extractors/KotlinExtractor.kt
@@ -14,6 +14,10 @@ class KotlinExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^(.*import)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val packageRegex = Regex("""^(.*package)\s[^\n]*""")
+        val extractImportRegex = Regex("""import\s+(\w+[.\w+]*)""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,9 +28,8 @@ class KotlinExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val regex = Regex("""import\s+(\w+[.\w+]*)""")
         fileContent.forEach {
-            val res = regex.find(it)
+            val res = extractImportRegex.find(it)
             if (res != null) {
                 val importedName = res.groupValues[1]
                 LIBRARIES.forEach { library ->
@@ -41,9 +44,6 @@ class KotlinExtractor : ExtractorInterface {
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^(.*import)\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
-        val packageRegex = Regex("""^(.*package)\s[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         newLine = packageRegex.replace(newLine, "")
@@ -53,6 +53,7 @@ class KotlinExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }
diff --git a/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt b/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt
@@ -14,6 +14,11 @@ class ObjectiveCExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val sharpImportIncludeRegex = // path opens with " or <, closes " or >
+                Regex("""#(import|include)\s+["<](\w+)[/\w+]*\.\w+[">]""")
+        val atImportRegex = Regex("""@import\s+(\w+)""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,10 +29,6 @@ class ObjectiveCExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val sharpImportIncludeRegex =
-            Regex("""#(import|include)\s+[">](\w+)[/\w+]*\.\w+[">]""")
-        val atImportRegex = Regex("""@import\s+(\w+)""")
-
         fileContent.forEach {
             val res = sharpImportIncludeRegex.findAll(it) +
                 atImportRegex.findAll(it)
@@ -41,8 +42,6 @@ class ObjectiveCExtractor : ExtractorInterface {
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         return super.tokenize(newLine)
@@ -51,6 +50,7 @@ class ObjectiveCExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }
diff --git a/src/main/kotlin/app/extractors/PhpExtractor.kt b/src/main/kotlin/app/extractors/PhpExtractor.kt
@@ -14,6 +14,11 @@ class PhpExtractor : ExtractorInterface {
         val evaluator by lazy {
             ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
         }
+        val importRegex = Regex("""^(.*require|require_once|include|include_once|use)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        val useRegex = Regex("""use\s+(\w+)[\\\w+]*""")
+        val requireIncludeRegex = Regex("""(require|require_once|include|""" +
+                """include_once)\s*[(]?'(\w+)[.\w+]*'[)]?""")
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,9 +29,6 @@ class PhpExtractor : ExtractorInterface {
     override fun extractImports(fileContent: List<String>): List<String> {
         val imports = mutableSetOf<String>()
 
-        val useRegex = Regex("""use\s+(\w+)[\\\w+]*""")
-        val requireIncludeRegex = Regex("""(require|require_once|include|""" +
-            """"include_once)\s*[(]?'(\w+)[.\w+]*'[)]?""")
         fileContent.forEach {
             val res = useRegex.findAll(it) + requireIncludeRegex.findAll(it)
             if (res.toList().isNotEmpty()) {
@@ -39,8 +41,6 @@ class PhpExtractor : ExtractorInterface {
     }
 
     override fun tokenize(line: String): List<String> {
-        val importRegex = Regex("""^(.*require|require_once|include|include_once|use)\s[^\n]*""")
-        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
         var newLine = importRegex.replace(line, "")
         newLine = commentRegex.replace(newLine, "")
         return super.tokenize(newLine)
@@ -49,6 +49,7 @@ class PhpExtractor : ExtractorInterface {
     override fun getLineLibraries(line: String,
                                   fileLibraries: List<String>): List<String> {
 
-        return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
+        return super.getLineLibraries(line, fileLibraries, evaluator,
+            LANGUAGE_NAME)
     }
 }