Added encoding support to avoid faulty behaviour on Windows

valentin-petzold · Dec 3, 2015 · 24d9126 · 24d9126
1 parent 1e03bc1
commit 24d9126
Showing 3 changed files with 73 additions and 30 deletions.
diff --git a/pymzml/run.py b/pymzml/run.py
@@ -36,6 +36,7 @@
 import re
 import os
 import bisect
+import codecs
 
 from xml.etree import cElementTree
 
@@ -162,21 +163,52 @@ def __init__(
         self.OT = self.__init_obo_translator(extraAccessions)
         return
 
+    def __determine_file_encoding(self, path):
+        '''
+        Determines the mzML XML encoding from the encoding declaration
+        in the first line of the file. Falls back to utf-8 if there is none.
+
+        '''
+        mzML_encoding = 'utf-8'
+        if os.path.exists( path ):
+            # path only exists on disk when we were given a filename;
+            # if we were initialized with a file-object there is nothing
+            # to sniff (and it is not seekable either), so utf-8 is kept
+            sniffer = open(path, 'rb')
+            header = sniffer.readline()
+            encodingPattern = re.compile(
+                b'encoding="(?P<encoding>[A-Za-z0-9-]*)"'
+            )
+            match = encodingPattern.search(header)
+            if match:
+                mzML_encoding = bytes.decode(
+                    match.group('encoding')
+                )
+            sniffer.close()
+        return mzML_encoding
+
+    def _open_file(self, path, given_file_object=None):
+        return self.__open_file( path, given_file_object=given_file_object)
+
     def __open_file(self, path, given_file_object=None):
         # Arbitrary supplied file objects are not seekable
         file_object = given_file_object
         seekable = False
-
+        self.info['encoding'] = self.__determine_file_encoding( path )
         if file_object is None:
+            import codecs
             if path.endswith('.gz'):
                 # Gzipped files are not seekable
                 import gzip
-                import codecs
                 file_object = codecs.getreader("utf-8")(
                     gzip.open(path)
                 )
             else:
-                file_object = open(path, 'r')
+                file_object = codecs.open(
+                    path,
+                    mode     = 'r',
+                    encoding = self.info['encoding']
+                )
                 seekable = True
 
         return file_object, seekable
@@ -198,16 +230,8 @@ def _build_index(self, from_scratch):
                   seeking to a particular offset for the file.
         """
 
-        # Declare the seeker
-        # Read encoding ... maybe not really needed ...
+        # Declare the pre-seeker
         seeker = open(self.info['filename'], 'rb')
-
-        header = seeker.readline()
-        encodingPattern = re.compile(b'encoding="(?P<encoding>[A-Za-z0-9-]*)"')
-        match = encodingPattern.search(header)
-        if match:
-            self.info['encoding'] = bytes.decode(match.group('encoding'))
-
         # Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos
         indexListOffsetPattern = re.compile(
             b'<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>'
@@ -219,7 +243,7 @@ def _build_index(self, from_scratch):
         self.info['offsets']['TIC'] = None
         seeker.seek(0, 2)
         spectrumIndexPattern = RegexPatterns.spectrumIndexPattern
-        for _ in range(10):  # max 10kbyte
+        for _ in range(1, 10):  # max 10kbyte
             # some converters fail in writing a correct index
             # we found
             # a) the offset is always the same (silent fail hurray!)
@@ -298,7 +322,12 @@ def _build_index(self, from_scratch):
                     self.info['offsetList'].append(offset)
             # opening seeker in normal mode again
         seeker.close()
-        seeker = open(self.info['filename'], 'r')
+        seeker = codecs.open(
+            self.info['filename'],
+            mode     = 'r',
+            encoding = self.info['encoding']
+        )
+        # seeker = open(self.info['filename'], 'r')
 
         return seeker
 
@@ -559,17 +588,19 @@ def __getitem__(self, value):
                 answer = self.spectrum
         else:
             # Reopen the file from the beginning if possible
-            self.info['fileObject'].close()
-
-            assert self.info['filename'], \
-                'Must specify either filename or index for random spectrum access'
-            self.info['fileObject'], _ = self.__open_file(self.info['filename'])
-            self.iter = self.__init_iter()
-
-            for _ in self:
-                if _['id'] == value:
-                    answer = _
-                    break
+            force_seeking = self.info.get('force_seeking', False)
+            if force_seeking is False:
+                self.info['fileObject'].close()
+
+                assert self.info['filename'], \
+                    'Must specify either filename or index for random spectrum access'
+                self.info['fileObject'], _ = self.__open_file(self.info['filename'])
+                self.iter = self.__init_iter()
+
+                for spec in self:
+                    if spec['id'] == value:
+                        answer = spec
+                        break
 
         if answer is None:
             raise KeyError("Run does not contain spec with native ID {0}".format(value))

diff --git a/test/data/example.mzml b/test/data/example.mzml
@@ -541,9 +541,9 @@
       <offset idRef="controllerType=0 controllerNumber=1 scan=10">132417</offset>
     </index>
     <index name="chromatogram">
-      <offset idRef="TIC">48372822</offset>
+      <offset idRef="TIC" info="just any number since not existend">132417</offset>
     </index>
   </indexList>
-  <indexListOffset>48420141</indexListOffset>
-  <fileChecksum>c4261af4c71774161cc0a3c62d3cde3adfba4ac7</fileChecksum>
+  <indexListOffset>210679</indexListOffset>
+  <fileChecksum>MZMLDemoFile</fileChecksum>
 </indexedmzML>
diff --git a/test/test_run.py b/test/test_run.py
@@ -26,12 +26,12 @@
 
 class TestRun(unittest.TestCase):
     def setUp(self):
-        example_mzml_filename = os.path.join(
+        self.example_mzml_path = os.path.join(
             os.path.dirname(__file__),
             'data',
             'example.mzml',
         )
-        self.example_mzml = open(example_mzml_filename)
+        self.example_mzml = open(self.example_mzml_path)
 
     def tearDown(self):
         self.example_mzml.close()
@@ -75,6 +75,18 @@ def test_regex(self):
         self.assertEqual(match_sim.group('nativeID'), b"SIM SIC 651.5")
         self.assertEqual(match_sim.group('offset'), b"330223452")
 
+    def test_mzML_encoding(self):
+        run = pymzml.run.Reader(self.example_mzml_path)
+        self.assertEqual(run.info['encoding'], 'ISO-8859-1')
+
+    def test_seeker(self):
+        run = pymzml.run.Reader(self.example_mzml_path)
+        # fo, is_seekable = run._open_file(self.example_mzml_path)
+        # self.assertTrue(is_seekable)
+        self.assertTrue(run.info['seekable'])
+        run.info['force_seeking'] = True
+        spec = run[9]
+        self.assertEqual(spec['defaultArrayLength'], 1069)
 
 
 if __name__ == '__main__':