Skip to content

Commit

Permalink
Added encoding support to avoid faulty behaviour on Windows
Browse files Browse the repository at this point in the history
  • Loading branch information
fu committed Dec 3, 2015
1 parent 1e03bc1 commit 24d9126
Showing 3 changed files with 73 additions and 30 deletions.
81 changes: 56 additions & 25 deletions pymzml/run.py
Original file line number Diff line number Diff line change
@@ -36,6 +36,7 @@
import re
import os
import bisect
import codecs

from xml.etree import cElementTree

@@ -162,21 +163,52 @@ def __init__(
self.OT = self.__init_obo_translator(extraAccessions)
return

def __determine_file_encoding(self, path):
    '''
    Determine the XML encoding of an mzML file.

    The encoding is read from the ``encoding="..."`` attribute found in
    the first line of the file. If nothing usable is found, utf-8 is
    returned.

    Args:
        path: Path of the mzML file. The reader may have been
            initialized with a file object instead, in which case
            *path* is not a usable path and the default is returned.

    Returns:
        str: The declared encoding, or 'utf-8' when *path* does not
        exist, is not a path, or the first line declares no (or an
        empty) encoding.
    '''
    mzML_encoding = 'utf-8'
    try:
        path_exists = os.path.exists(path)
    except TypeError:
        # Not a path at all (e.g. None because a file object was
        # supplied): nothing to sniff, keep the default.
        path_exists = False
    if not path_exists:
        return mzML_encoding

    encodingPattern = re.compile(
        b'encoding="(?P<encoding>[A-Za-z0-9-]*)"'
    )
    # Read in binary mode because the encoding is not known yet. The
    # context manager closes the handle even if reading raises.
    with open(path, 'rb') as sniffer:
        header = sniffer.readline()

    match = encodingPattern.search(header)
    # The pattern also matches encoding=""; an empty name is useless to
    # the codecs machinery, so only a non-empty match overrides utf-8.
    if match and match.group('encoding'):
        mzML_encoding = bytes.decode(match.group('encoding'))
    return mzML_encoding

def _open_file(self, path, given_file_object=None):
    '''
    Single-underscore entry point that forwards to ``__open_file``.

    Both arguments are passed through unchanged and the result of
    ``__open_file`` is returned as is.
    '''
    opened = self.__open_file(path, given_file_object=given_file_object)
    return opened

def __open_file(self, path, given_file_object=None):
    '''
    Return a readable file object for the run and whether it is seekable.

    As a side effect the encoding determined for *path* is stored in
    ``self.info['encoding']``.

    Args:
        path: Path of the mzML file (optionally ending in ``.gz``).
        given_file_object: Already opened file object. When supplied it
            is returned untouched and nothing is opened here.

    Returns:
        tuple: ``(file_object, seekable)``. ``seekable`` is True only
        for a plain file opened by this method.
    '''
    import codecs

    self.info['encoding'] = self.__determine_file_encoding(path)

    if given_file_object is not None:
        # Arbitrary supplied file objects are not seekable
        return given_file_object, False

    if path.endswith('.gz'):
        # Gzipped files are not seekable
        import gzip
        file_object = codecs.getreader("utf-8")(
            gzip.open(path)
        )
        return file_object, False

    # Open exactly once, with the detected encoding. A plain
    # open(path, 'r') beforehand would leave a handle that is
    # immediately overwritten and never closed.
    file_object = codecs.open(
        path,
        mode='r',
        encoding=self.info['encoding']
    )
    return file_object, True
@@ -198,16 +230,8 @@ def _build_index(self, from_scratch):
seeking to a particular offset for the file.
"""

# Declare the seeker
# Read encoding ... maybe not really needed ...
# Declare the pre-seeker
seeker = open(self.info['filename'], 'rb')

header = seeker.readline()
encodingPattern = re.compile(b'encoding="(?P<encoding>[A-Za-z0-9-]*)"')
match = encodingPattern.search(header)
if match:
self.info['encoding'] = bytes.decode(match.group('encoding'))

# Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos
indexListOffsetPattern = re.compile(
b'<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>'
@@ -219,7 +243,7 @@ def _build_index(self, from_scratch):
self.info['offsets']['TIC'] = None
seeker.seek(0, 2)
spectrumIndexPattern = RegexPatterns.spectrumIndexPattern
for _ in range(10): # max 10kbyte
for _ in range(1, 10): # max 10kbyte
# some converters fail in writing a correct index
# we found
# a) the offset is always the same (silent fail hurray!)
@@ -298,7 +322,12 @@ def _build_index(self, from_scratch):
self.info['offsetList'].append(offset)
# opening seeker in normal mode again
seeker.close()
seeker = open(self.info['filename'], 'r')
seeker = codecs.open(
self.info['filename'],
mode = 'r',
encoding = self.info['encoding']
)
# seeker = open(self.info['filename'], 'r')

return seeker

@@ -559,17 +588,19 @@ def __getitem__(self, value):
answer = self.spectrum
else:
# Reopen the file from the beginning if possible
self.info['fileObject'].close()

assert self.info['filename'], \
'Must specify either filename or index for random spectrum access'
self.info['fileObject'], _ = self.__open_file(self.info['filename'])
self.iter = self.__init_iter()

for _ in self:
if _['id'] == value:
answer = _
break
force_seeking = self.info.get('force_seeking', False)
if force_seeking is False:
self.info['fileObject'].close()

assert self.info['filename'], \
'Must specify either filename or index for random spectrum access'
self.info['fileObject'], _ = self.__open_file(self.info['filename'])
self.iter = self.__init_iter()

for spec in self:
if spec['id'] == value:
answer = spec
break

if answer is None:
raise KeyError("Run does not contain spec with native ID {0}".format(value))
6 changes: 3 additions & 3 deletions test/data/example.mzml
Original file line number Diff line number Diff line change
@@ -541,9 +541,9 @@
<offset idRef="controllerType=0 controllerNumber=1 scan=10">132417</offset>
</index>
<index name="chromatogram">
<offset idRef="TIC">48372822</offset>
<offset idRef="TIC" info="just any number since not existend">132417</offset>
</index>
</indexList>
<indexListOffset>48420141</indexListOffset>
<fileChecksum>c4261af4c71774161cc0a3c62d3cde3adfba4ac7</fileChecksum>
<indexListOffset>210679</indexListOffset>
<fileChecksum>MZMLDemoFile</fileChecksum>
</indexedmzML>
16 changes: 14 additions & 2 deletions test/test_run.py
Original file line number Diff line number Diff line change
@@ -26,12 +26,12 @@

class TestRun(unittest.TestCase):
def setUp(self):
    """Build the path to the example mzML fixture and open it.

    The path is kept on the instance as ``self.example_mzml_path`` and
    the opened file object as ``self.example_mzml``.
    """
    self.example_mzml_path = os.path.join(
        os.path.dirname(__file__),
        'data',
        'example.mzml',
    )
    # Open the fixture once, via the stored path.
    self.example_mzml = open(self.example_mzml_path)

def tearDown(self):
# Close the file object stored in self.example_mzml.
self.example_mzml.close()
@@ -75,6 +75,18 @@ def test_regex(self):
self.assertEqual(match_sim.group('nativeID'), b"SIM SIC 651.5")
self.assertEqual(match_sim.group('offset'), b"330223452")

def test_mzML_encoding(self):
    """Reader.info['encoding'] for the example file must be 'ISO-8859-1'."""
    reader = pymzml.run.Reader(self.example_mzml_path)
    detected_encoding = reader.info['encoding']
    self.assertEqual(detected_encoding, 'ISO-8859-1')

def test_seeker(self):
    """Look up item 9 of a path-based reader with 'force_seeking' set.

    The reader must report itself as seekable, and the returned
    spectrum must have a defaultArrayLength of 1069.
    """
    reader = pymzml.run.Reader(self.example_mzml_path)
    self.assertTrue(reader.info['seekable'])

    reader.info['force_seeking'] = True
    spectrum = reader[9]
    array_length = spectrum['defaultArrayLength']
    self.assertEqual(array_length, 1069)


if __name__ == '__main__':

0 comments on commit 24d9126

Please sign in to comment.