Skip to content

Commit

Permalink
CSVLintBear: Get more info
Browse files Browse the repository at this point in the history
Updates Bear in order to get Line and the erroneous text.
Adjusts Severity Map as per docs.
(https://github.com/theodi/csvlint.rb/blob/master/README.md)

Closes coala#967
  • Loading branch information
nemani committed Dec 18, 2016
1 parent 326b14b commit ee7c9c2
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 8 deletions.
81 changes: 77 additions & 4 deletions bears/csv/CSVLintBear.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import re

from coalib.bearlib.abstractions.Linter import linter
from dependency_management.requirements.GemRequirement import GemRequirement
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY


@linter(executable='csvlint',
output_format='regex',
output_regex=r'\d\. (?P<message>.+(s|g|e|w|d)\.*)'
r'( |$)(?P<additional_info>.*)')
@linter(executable='csvlint')
class CSVLintBear:
"""
Verifies using ``csvlint`` if ``.csv`` files are valid CSV or not.
Expand All @@ -17,7 +17,80 @@ class CSVLintBear:
AUTHORS_EMAILS = {'[email protected]'}
LICENSE = 'AGPL-3.0'
CAN_DETECT = {'Syntax'}
ASCIINEMA_URL = 'https://asciinema.org/a/8fmp2pny34kpqw7t1eoy7phhc'

# Pattern for one numbered entry of the linter report. It matches, in order:
# a newline, the entry number followed by ". ", a word, ". ", an optional
# "Row: <n>. " part and finally the rest of that line.
# Named groups:
#   origin, severity - the same word, captured under both names
#   line             - the digits after "Row: " (None if that part is absent)
#   message          - the remaining text of the line (may be empty)
regex = re.compile(r'\n\d+\.\s(?P<origin>(?P<severity>\w+))\.\s'
r'(Row:\s(?P<line>[0-9]+)\.\s)?(?P<message>.*)?')

# Severity assigned to each csvlint message code, grouped by level.
severity_map = dict(
    # Reported as MAJOR.
    wrong_content_type=RESULT_SEVERITY.MAJOR,
    ragged_rows=RESULT_SEVERITY.MAJOR,
    blank_rows=RESULT_SEVERITY.MAJOR,
    invalid_encoding=RESULT_SEVERITY.MAJOR,
    not_found=RESULT_SEVERITY.MAJOR,
    stray_quote=RESULT_SEVERITY.MAJOR,
    unclosed_quote=RESULT_SEVERITY.MAJOR,
    whitespace=RESULT_SEVERITY.MAJOR,
    line_breaks=RESULT_SEVERITY.MAJOR,
    # Reported as NORMAL.
    no_encoding=RESULT_SEVERITY.NORMAL,
    encoding=RESULT_SEVERITY.NORMAL,
    no_content_type=RESULT_SEVERITY.NORMAL,
    excel=RESULT_SEVERITY.NORMAL,
    check_options=RESULT_SEVERITY.NORMAL,
    inconsistent_values=RESULT_SEVERITY.NORMAL,
    empty_column_name=RESULT_SEVERITY.NORMAL,
    duplicate_column_name=RESULT_SEVERITY.NORMAL,
    title_row=RESULT_SEVERITY.NORMAL,
    # Reported as INFO.
    nonrfc_line_breaks=RESULT_SEVERITY.INFO,
    assumed_header=RESULT_SEVERITY.INFO,
)

# Human-readable description for each csvlint message code.
# The keys have to be spelled exactly like the codes used as keys of
# ``severity_map``: ``process_output`` looks the description up by the code
# captured from the linter output, so a misspelled key makes that lookup fail.
message_dict = {
    'wrong_content_type': 'Content type is not text/csv.',
    'ragged_rows': 'Row has a different number of columns. (than the first'
                   ' row in the file)',
    'blank_rows': 'Completely empty row, e.g. blank line or a line where'
                  ' all column values are empty.',
    'invalid_encoding': 'Encoding error when parsing row, e.g. because of'
                        ' invalid characters.',
    'not_found': 'HTTP 404 error when retrieving the data.',
    'stray_quote': 'Missing or stray quote.',
    'unclosed_quote': 'Unclosed quoted field.',
    'whitespace': 'A quoted column has leading or trailing whitespace.',
    'line_breaks': 'Line breaks were inconsistent or incorrectly'
                   ' specified.',
    'no_encoding': 'The Content-Type header returned in the HTTP request'
                   ' does not have a charset parameter.',
    'encoding': 'The character set is not UTF-8.',
    'no_content_type': 'File is being served without a Content-Type'
                       ' header.',
    'excel': 'No Content-Type header and the file extension is .xls.',
    'check_options': 'CSV file appears to contain only a single column.',
    'inconsistent_values': 'Inconsistent values in the same column.'
                           ' Reported if <90% of values seem to have same'
                           ' data type. (either numeric or alphanumeric'
                           ' including punctuation)',
    'empty_column_name': 'A column in the CSV header has an empty name.',
    'duplicate_column_name': 'A column in the CSV header has a duplicate'
                             ' name.',
    'title_row': 'There appears to be a title field in the first row of'
                 ' the CSV.',
    'nonrfc_line_breaks': 'Uses non-CRLF line breaks, so does not conform'
                          ' to RFC4180.',
    'assumed_header': 'The validator has assumed that a header is present.',
}

@staticmethod
def create_arguments(filename, file, config_file):
    """
    Return the argument tuple for the linter executable.

    :param filename:    Name of the file to lint; the only argument used.
    :param file:        Not used by this bear.
    :param config_file: Not used by this bear.
    :return:            A one-element tuple containing ``filename``.
    """
    arguments = (filename,)
    return arguments

@classmethod
def process_output(cls, output, filename, file, result_message=None):
    """
    Turn the textual linter report into results, one per matched entry.

    Every entry matched by ``regex`` is looked up in ``message_dict`` to
    obtain its description. Entries without a ``Row: <n>.`` part get the
    remaining text of the line appended to that description.

    :param output:         Output of the linter; converted with ``str``.
    :param filename:       Name of the checked file, forwarded unchanged.
    :param file:           Not used here.
    :param result_message: Not used; the message is always derived from
                           the matched code.
    :return:               A generator yielding whatever
                           ``_convert_output_regex_match_to_result``
                           returns for each entry.
    """
    for match in re.finditer(cls.regex, str(output)):
        groups = match.groupdict()
        code = groups['origin']

        # A code without a description must not abort the whole run with
        # a KeyError (dropping every later entry); show the raw code.
        description = cls.message_dict.get(code, code)

        # Only entries lacking a row number carry their trailing text
        # (e.g. "Column: 7") over into the message.
        detail = ' ' + groups['message'] if groups['line'] is None else ''

        # NOTE(review): the helper is fetched from the class and is
        # presumably a plain instance method there, hence the class is
        # handed over explicitly as its first argument - confirm against
        # the linter base class.
        yield cls._convert_output_regex_match_to_result(
            cls,
            match, filename, severity_map=cls.severity_map,
            result_message=description + detail)
51 changes: 47 additions & 4 deletions tests/csv/CSVLintBearTest.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,63 @@
import unittest

from queue import Queue
from bears.csv.CSVLintBear import CSVLintBear
from tests.LocalBearTestHelper import verify_local_bear
from coalib.testing.BearTestHelper import generate_skip_decorator
from coalib.testing.LocalBearTestHelper import verify_local_bear, execute_bear
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.settings.Section import Section
from coala_utils.ContextManagers import prepare_file

# Well-formed fixture: a six-column header followed by three rows that all
# have six fields. Passed to verify_local_bear as a valid file.
good_file = """id,first_name,last_name,email,gender,ip_address
1,Cynthia,Rogers,[email protected],Female,158.131.39.207
2,Lisa,Carroll,[email protected],Female,157.69.195.53
3,Kevin,Baker,[email protected],Male,113.189.69.4
"""


bad_file = """id,first_name,last_name,email,gender,ip_address
# Fixture with a ragged row: the second data row has only five fields (the
# gender value is missing) under a six-column header. test_errors expects
# the first result for this content to be ``ragged_rows`` with MAJOR
# severity.
major_file = """id,first_name,last_name,email,gender,ip_address
1,Cynthia,Rogers,[email protected],Female,158.131.39.207
2,Lisa,Carroll,[email protected],157.69.195.53
3,Kevin,Baker,[email protected],Male,113.189.69.4
"""

# Fixture with a duplicated header name: ``first_name`` appears again as the
# seventh column. test_normal expects the first result for this content to
# be ``duplicate_column_name`` with NORMAL severity.
normal_file = """id,first_name,last_name,email,gender,ip_address,first_name
1,Cynthia,Rogers,[email protected],Female,158.131.39.207,A
2,Lisa,Carroll,[email protected],Female,157.69.195.53,A
3,Kevin,Baker,[email protected],Male,113.189.69.4,A
"""

CSVLintBearTest = verify_local_bear(CSVLintBear,
valid_files=(good_file,),
invalid_files=(bad_file,))
invalid_files=(major_file, normal_file))


@generate_skip_decorator(CSVLintBear)
class CSVLintBearSeverityTest(unittest.TestCase):
    """
    Checks severity, message, origin and aspect of the first result that
    ``CSVLintBear`` yields for the ``normal_file`` and ``major_file``
    fixtures.
    """

    def setUp(self):
        self.section = Section('')
        self.uut = CSVLintBear(self.section, Queue())

    def _check_first_result(self, csv_text, severity, message, origin):
        """
        Run the bear on ``csv_text`` and compare its first result.

        :param csv_text: CSV content to lint; split into lines before it is
                         handed to ``prepare_file``.
        :param severity: Expected ``severity`` of the first result.
        :param message:  Expected ``message`` of the first result.
        :param origin:   Expected ``origin`` of the first result.
        """
        lines = csv_text.splitlines()
        with prepare_file(lines, None) as (file, fname):
            with execute_bear(self.uut, fname, file) as results:
                first = results[0]
                self.assertEqual(first.severity, severity)
                self.assertEqual(first.message, message)
                self.assertEqual(first.origin, origin)
                self.assertEqual(first.aspect, None)

    def test_normal(self):
        self._check_first_result(
            normal_file,
            RESULT_SEVERITY.NORMAL,
            'A column in the CSV header has a duplicate name. Column: 7',
            'CSVLintBear (duplicate_column_name)')

    def test_errors(self):
        self._check_first_result(
            major_file,
            RESULT_SEVERITY.MAJOR,
            'Row has a different number of columns.'
            ' (than the first row in the file)',
            'CSVLintBear (ragged_rows)')

0 comments on commit ee7c9c2

Please sign in to comment.