sgml to xml

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@146 1aa58f4a-7d42-0410-adbc-911cccaed67c
phoenixlqh · Oct 31, 2009 · 78f7866 · 78f7866
1 parent 23b8058
commit 78f7866
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 26 deletions.
diff --git a/TODO b/TODO
@@ -1,5 +1,4 @@
 TODOs:
-  - sgml->xml
   - PEP-8 conformance.
   - Better text extraction / layout analysis.
   - Better API Documentation.

diff --git a/docs/index.html b/docs/index.html
@@ -19,7 +19,7 @@ <h1>PDFMiner</h1>
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Oct 31 11:08:31 JST 2009
+Last Modified: Sat Oct 31 12:03:49 JST 2009
 <!-- hhmts end -->
 </div>
 
@@ -209,7 +209,7 @@ <h3>pdf2txt.py</h3>
 <ul>
 <li> <code>text</code> : TEXT format. (Default)
 <li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy.
-<li> <code>sgml</code> : SGML format. Provides the most information available.
+<li> <code>xml</code> : XML format. Provides the most information available.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
 Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -156,10 +156,16 @@ def write(self, text):
         return
 
 
-##  SGMLConverter
+##  XMLConverter
 ##
-class SGMLConverter(PDFConverter):
+class XMLConverter(PDFConverter):
 
+    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):  # initialise the base converter, then start the XML document
+        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
+        self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)  # XML declaration; the encoding attribute is the codec argument
+        self.outfp.write('<pages>\n')  # opening root tag; close() writes the matching </pages>
+        return
+
     def end_page(self, page):
         def render(item):
             if isinstance(item, LTPage):
@@ -202,6 +208,10 @@ def render(item):
         render(page)
         return
 
+    def close(self):  # terminate the document opened in __init__
+        self.outfp.write('</pages>\n')  # closing tag for the <pages> root element
+        return
+
 
 ##  HTMLConverter
 ##

diff --git a/samples/Makefile b/samples/Makefile
@@ -25,30 +25,33 @@ TEXTS= \
 	naacl06-shinyama.txt \
 	nlp2004slides.txt
 
-SGMLS= \
-	simple1.sgml \
-	simple2.sgml \
-	dmca.sgml \
-	f1040nr.sgml \
-	i1040nr.sgml \
-	jo.sgml \
-	kampo.sgml \
-	naacl06-shinyama.sgml \
-	nlp2004slides.sgml
+XMLS= \
+	simple1.xml \
+	simple2.xml \
+	dmca.xml \
+	f1040nr.xml \
+	i1040nr.xml \
+	jo.xml \
+	kampo.xml \
+	naacl06-shinyama.xml \
+	nlp2004slides.xml
 
 all:
 
 clean:
 	-rm $(HTMLS)
 	-rm $(TEXTS)
-	-rm $(SGMLS)
+	-rm $(XMLS)
 
-test: $(HTMLS) $(TEXTS) $(SGMLS)
+test: htmls texts xmls  # aggregate target: build every sample output format
+htmls: $(HTMLS)
+texts: $(TEXTS)  # target name must match the "texts" prerequisite of "test"
+xmls: $(XMLS)
 
-.SUFFIXES: .pdf .html .sgml .txt
+.SUFFIXES: .pdf .html .xml .txt
 .pdf.html:
 	$(PDF2TXT) -t html $< > $@
-.pdf.sgml:
-	$(PDF2TXT) -t sgml $< > $@
+.pdf.xml:
+	$(PDF2TXT) -t xml $< > $@
 .pdf.txt:
 	$(PDF2TXT) -t text $< > $@
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
@@ -3,7 +3,7 @@
 from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 from pdfminer.pdfdevice import PDFDevice
-from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
+from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
 from pdfminer.cmap import CMapDB, find_cmap_path
 from pdfminer.layout import LAParams
 
@@ -13,7 +13,7 @@ def main(argv):
     def usage():
         print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
                '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
-               '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
+               '[-t text|html|xml|tag] [-o output] file ...' % argv[0])
         return 100
     try:
         (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
@@ -65,8 +65,8 @@ def usage():
         if outfile:
             if outfile.endswith('.htm') or outfile.endswith('.html'):
                 outtype = 'html'
-            elif outfile.endswith('.sgml'):
-                outtype = 'sgml'
+            elif outfile.endswith('.xml'):
+                outtype = 'xml'
             elif outfile.endswith('.tag'):
                 outtype = 'tag'
     if outfile:
@@ -75,8 +75,8 @@ def usage():
         outfp = sys.stdout
     if outtype == 'text':
         device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
-    elif outtype == 'sgml':
-        device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
+    elif outtype == 'xml':
+        device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
     elif outtype == 'html':
         device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
     elif outtype == 'tag':