Skip to content

Commit

Permalink
Ignore namespaces while reading.
Browse files Browse the repository at this point in the history
databricks#74

This is a workaround for reading XML files with namespaces.
Currently, namespaces are simply ignored, but we might need to handle them through an option or some other mechanism.

This PR makes the library able to read such an XML file, rather than dropping its rows as malformed as shown in the warning below:

```bash
11:25:32.517 WARN com.databricks.spark.xml.util.InferSchema$: Dropping malformed row: <Topic r:id="">        <catid>1</catid>    </Topic>
root
```

Author: hyukjinkwon <[email protected]>

Closes databricks#75 from HyukjinKwon/ISSUE-74-namespace.
  • Loading branch information
HyukjinKwon committed Jan 27, 2016
1 parent eb6e79f commit 5ef10e3
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ private[xml] object StaxXmlParser {
// It does not have to skip for white space, since `XmlInputFormat`
// always finds the root tag without a heading space.
val factory = XMLInputFactory.newInstance()
factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false)
val reader = new ByteArrayInputStream(xml.getBytes)
val parser = factory.createXMLEventReader(reader)
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ private[xml] object InferSchema {
// It does not have to skip for white space, since [[XmlInputFormat]]
// always finds the root tag without a heading space.
val factory = XMLInputFactory.newInstance()
factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false)
val reader = new ByteArrayInputStream(xml.getBytes)
val parser = factory.createXMLEventReader(reader)
try {
Expand Down
7 changes: 7 additions & 0 deletions src/test/resources/topics-namespaces.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<RDF xmlns:r="http://www.w3.org/TR/RDF/" xmlns:d="http://purl.org/dc/elements/1.0/" xmlns="http://dmoz.org/rdf/">
<!-- Generated at 2016-01-24 00:05:51 EST from DMOZ 2.0 -->
<Topic r:id="">
<catid>1</catid>
</Topic>
</RDF>
11 changes: 11 additions & 0 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,16 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
val carsMalformedFile = "src/test/resources/cars-malformed.xml"
val nullNumbersFile = "src/test/resources/null-numbers.xml"
val emptyFile = "src/test/resources/empty.xml"
val topicsFile = "src/test/resources/topics-namespaces.xml"

val booksTag = "book"
val booksRootTag = "books"
val topicsTag = "Topic"

val numCars = 3
val numBooks = 12
val numBooksComplicated = 3
val numTopics = 1

private var sqlContext: SQLContext = _

Expand Down Expand Up @@ -572,4 +575,12 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {

assert(results(1).toSeq === Seq("bob", null))
}

// Rows whose elements carry namespace-prefixed attributes (e.g. `r:id`)
// must still be read instead of being dropped as malformed.
test("DSL test with namespaces ignored") {
  val topics = sqlContext.xmlFile(topicsFile, rowTag = topicsTag).collect()

  assert(topics.length === numTopics)
}
}

0 comments on commit 5ef10e3

Please sign in to comment.