Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Capture more of the Document #668

Merged
merged 1 commit into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Capture more of the Document
- handle Locator2, DTDHandler, DeclHandler, and LexicalHandler DTD and entity methods;
- introduce DtdBuilder and delegate DTD methods to it;
- introduce ElementContentModel and delegate parsing of the element content model in the DTD element declarations to it;
- XMLLoader.loadDocument() returns baseURI, XML version, encoding, standAlone flag, and DTD;
- tests demonstrating the new returns;
  • Loading branch information
dubinsky committed Jun 20, 2023
commit 0203fae7ff1e370702a6efbc9b772564ee688946
Binary file added jvm/src/test/resources/scala/xml/utf16.xml
Binary file not shown.
2 changes: 2 additions & 0 deletions jvm/src/test/resources/scala/xml/utf8.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<a/>
179 changes: 172 additions & 7 deletions jvm/src/test/scala/scala/xml/XMLTest.scala
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
package scala.xml

import org.junit.{Test => UnitTest}
import org.junit.Assert.{assertEquals, assertFalse, assertTrue}
import org.junit.Assert.{assertEquals, assertFalse, assertNull, assertThrows, assertTrue}
import java.io.StringWriter
import java.io.ByteArrayOutputStream
import java.net.URL
import scala.xml.dtd.{DocType, PublicID}
import scala.xml.parsing.ConstructingParser
import scala.xml.Utility.sort
Expand Down Expand Up @@ -681,14 +682,16 @@ class XMLTestJVM {
assertTrue(gotAnError)
}

/** Looks up the test resource named `resourceName` with the ".xml" extension appended, via this class's `getResource`. */
def resourceUrl(resourceName: String): URL = {
  val fileName: String = resourceName + ".xml"
  getClass.getResource(fileName)
}

// Here we see that opening InputStream prematurely, as was done previously, breaks XInclude.
@UnitTest(expected = classOf[org.xml.sax.SAXParseException]) def xIncludeNeedsSystemId(): Unit = {
val parserFactory = xercesInternal
parserFactory.setNamespaceAware(true)
parserFactory.setXIncludeAware(true)
XML
.withSAXParser(parserFactory.newSAXParser)
.load(getClass.getResource("site.xml").openStream())
.load(resourceUrl("site").openStream())
.toString
}

Expand All @@ -703,7 +706,7 @@ class XMLTestJVM {
parserFactory.setXIncludeAware(true)
val actual: String = XML
.withSAXParser(parserFactory.newSAXParser)
.load(getClass.getResource(resourceName))
.load(resourceUrl(resourceName))
.toString

assertEquals(expected, actual)
Expand All @@ -718,8 +721,8 @@ class XMLTestJVM {
|</includee>
|</includer>""".stripMargin

@UnitTest def xIncludeWithExternalXerces(): Unit = check(xercesExternal, "includer.xml", includerExpected)
@UnitTest def xIncludeWithInternalXerces(): Unit = check(xercesInternal, "includer.xml", includerExpected)
@UnitTest def xIncludeWithExternalXerces(): Unit = check(xercesExternal, "includer", includerExpected)
@UnitTest def xIncludeWithInternalXerces(): Unit = check(xercesInternal, "includer", includerExpected)

// And here we demonstrate that both external and built-in Xerces report incorrect `xml:base`
// when the XML file included contains its own include, and included files are not in the same directory:
Expand Down Expand Up @@ -750,8 +753,170 @@ class XMLTestJVM {
//
// I find it utterly incomprehensible that a foundational library, shipped with the JDK and used everywhere,
// has had a bug in its core functionality for years that never gets fixed, but sadly, that is the state of affairs:
@UnitTest def xIncludeFailWithExternalXerces(): Unit = check(xercesExternal, "site.xml", siteUnfortunatelyExpected)
@UnitTest def xIncludeFailWithInternalXerces(): Unit = check(xercesInternal, "site.xml", siteUnfortunatelyExpected)
@UnitTest def xIncludeFailWithExternalXerces(): Unit = check(xercesExternal, "site", siteUnfortunatelyExpected)
@UnitTest def xIncludeFailWithInternalXerces(): Unit = check(xercesInternal, "site", siteUnfortunatelyExpected)

/** Checks which parsers report the base URI of a document loaded from a URL. */
@UnitTest
def documentBaseURI(): Unit = {
  val url: URL = resourceUrl("site")
  val expectedSuffix: String = "/test-classes/scala/xml/site.xml"

  // XMLLoader returns the document's baseURI, with both the built-in and the external Xerces.
  // JUnit's assertTrue is used rather than Predef.assert: the latter is elidable and
  // reports only "assertion failed", while here the offending value ends up in the message.
  val internalBaseURI = XML.withSAXParser(xercesInternal.newSAXParser).loadDocument(url).baseURI
  assertTrue(s"internal Xerces: unexpected baseURI $internalBaseURI", internalBaseURI.endsWith(expectedSuffix))

  val externalBaseURI = XML.withSAXParser(xercesExternal.newSAXParser).loadDocument(url).baseURI
  assertTrue(s"external Xerces: unexpected baseURI $externalBaseURI", externalBaseURI.endsWith(expectedSuffix))

  // ConstructingParser does not return it, of course: since it reads from a scala.io.Source,
  // it has no idea where the XML is coming from.
  assertNull(ConstructingParser.fromSource(scala.io.Source.fromURI(url.toURI), preserveWS = false).document().baseURI)
}

/** Compares the `standAlone` value reported by ConstructingParser and by XMLLoader for various `xml` declarations. */
@UnitTest
def xmlStandAlone(): Unit = {
  // Plain literals: nothing is interpolated into these documents.
  val standAlone: String = """<?xml version="1.0" standalone="yes"?><a/>"""
  val nonStandAlone: String = """<?xml version="1.0" standalone="no"?><a/>"""
  val default: String = """<?xml version="1.0"?><a/>"""
  val noXmlDeclaration: String = """<a/>"""

  // Each call builds a fresh parser, exactly as the individual assertions used to do inline.
  def constructed(xml: String): Document =
    ConstructingParser.fromSource(scala.io.Source.fromString(xml), preserveWS = false).document()
  def loaded(xml: String): Document =
    XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(xml)

  // All Option checks go through assertEquals so that a failure shows the actual value.

  // ConstructingParser returns standAlone status of the document straight from the `xml` declaration:
  assertEquals(Some(true), constructed(standAlone).standAlone)
  assertEquals(Some(false), constructed(nonStandAlone).standAlone)
  assertEquals(None, constructed(default).standAlone)
  // ConstructingParser incorrectly returns null standAlone value when the document does not have the xml declaration:
  assertNull(constructed(noXmlDeclaration).standAlone)

  // XMLLoader returns standAlone status of the document straight from the `xml` declaration,
  // and reports `false` when the attribute or the whole declaration is absent:
  assertEquals(Some(true), loaded(standAlone).standAlone)
  assertEquals(Some(false), loaded(nonStandAlone).standAlone)
  assertEquals(Some(false), loaded(default).standAlone)
  assertEquals(Some(false), loaded(noXmlDeclaration).standAlone)
}

/** Compares the XML `version` reported by ConstructingParser and by XMLLoader. */
@UnitTest
def xmlVersion(): Unit = {
  val xml10: String = """<?xml version="1.0"?><a/>"""
  val xml11: String = """<?xml version="1.1"?><a/>"""
  val noXmlDeclaration: String = """<a/>"""

  // Parse with ConstructingParser; a new parser is created for every document.
  def constructed(xml: String): Document =
    ConstructingParser.fromSource(scala.io.Source.fromString(xml), preserveWS = false).document()
  // Parse with XMLLoader on top of a new built-in Xerces parser.
  def loaded(xml: String): Document =
    XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(xml)

  // ConstructingParser takes the version straight from the `xml` declaration when it is "1.0":
  assertEquals(Some("1.0"), constructed(xml10).version)
  // ... returns an incorrect (empty) value when the version is "1.1"
  // (and complains that it cannot deal with versions other than 1.0):
  assertTrue(constructed(xml11).version.isEmpty)
  // ... and incorrectly returns null when the document has no xml declaration at all:
  assertNull(constructed(noXmlDeclaration).version)

  // XMLLoader returns the XML version straight from the `xml` declaration;
  // first make sure the parser factory has XML 1.1 support switched on.
  assertTrue(xercesInternal.getFeature("http://xml.org/sax/features/xml-1.1"))
  assertEquals(Some("1.0"), loaded(xml10).version)
  assertEquals(Some("1.1"), loaded(xml11).version)
  assertEquals(Some("1.0"), loaded(noXmlDeclaration).version)
}

/** Compares the `encoding` reported by ConstructingParser and by XMLLoader, for strings and for resource files. */
@UnitTest
def xmlEncoding(): Unit = {
  val utf8: String = """<?xml version="1.0" encoding="UTF-8"?><a/>"""
  val utf16: String = """<?xml version="1.0" encoding="UTF-16"?><a/>"""
  val default: String = """<?xml version="1.0"?><a/>"""
  val noXmlDeclaration: String = """<a/>"""

  // Parse an arbitrary Source with a new ConstructingParser.
  def constructedFrom(source: scala.io.Source): Document =
    ConstructingParser.fromSource(source, preserveWS = false).document()
  def constructed(xml: String): Document =
    constructedFrom(scala.io.Source.fromString(xml))

  // ConstructingParser returns the encoding name canonicalized from the `xml` declaration:
  assertEquals(Some("UTF-8"), constructed(utf8).encoding)
  assertEquals(Some("UTF-16"), constructed(utf16).encoding)
  assertEquals(None, constructed(default).encoding)
  // ... but incorrectly returns null when the document has no xml declaration:
  assertNull(constructed(noXmlDeclaration).encoding)

  // XMLLoader does not return the encoding specified in the `xml` declaration of a string:
  assertEquals(None, XML.loadStringDocument(utf8).encoding)
  assertEquals(None, XML.loadStringDocument(utf16).encoding)
  assertEquals(None, XML.loadStringDocument(default).encoding)
  assertEquals(None, XML.loadStringDocument(noXmlDeclaration).encoding)

  // XMLLoader returns the encoding determined from the Byte Order Mark in the document itself:
  assertEquals(Some("UTF-8"), XML.loadDocument(resourceUrl("utf8")).encoding)
  assertEquals(Some("UTF-16BE"), XML.loadDocument(resourceUrl("utf16")).encoding)

  // ConstructingParser doesn't seem to be able to parse XML with a Byte Order Mark:
  assertThrows(
    classOf[java.nio.charset.MalformedInputException],
    () => constructedFrom(scala.io.Source.fromURI(resourceUrl("utf16").toURI)).encoding
  )
}

// Parses one document that has an external DTD reference and an internal DTD subset,
// first with XMLLoader and then with ConstructingParser, and compares what each of them
// captures in Document.dtd and how each resolves the entity declared in the internal subset.
@UnitTest
def loadDtd(): Unit = {
val parserFactory: javax.xml.parsers.SAXParserFactory = xercesExternal
// Switch off the "load-external-dtd" feature so that the docbook.dtd named in the DOCTYPE is not loaded.
parserFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false)

// The internal subset declares elements with various content models, an attribute list,
// an entity and a notation; the document body references the entity.
val xml: String =
s"""<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [
| <!ELEMENT AnyElement ANY>
| <!ELEMENT EmptyElement EMPTY>
| <!ELEMENT PCDataElement (#PCDATA)>
| <!ELEMENT MixedElement (#PCDATA|element|complex)*>
| <!ELEMENT ChildrenElement (element+,complex?)>
| <!ELEMENT element (#PCDATA)>
| <!ELEMENT complex (#PCDATA)>
| <!ATTLIST complex
| implied CDATA #IMPLIED
| required CDATA #REQUIRED
| fixed CDATA #FIXED "fixed"
| default CDATA "default"
| enumerated (InStock|Backordered|Discontinued) "InStock"
| >
| <!ENTITY AUTHOR "John Doe">
| <!NOTATION jpg PUBLIC "JPG 1.0">
|]>
|<document>&AUTHOR;</document>
|""".stripMargin

val document: Document = XML.withSAXParser(parserFactory.newSAXParser).loadStringDocument(xml)

// XMLLoader parses and returns the DTD.
// Note: dtd.ContentModel, which DTD uses to represent the element content model, lacks fidelity:
// occurrence indicators "?" and "+" cannot be expressed (both show up as "*" below).
// Note: the spurious parentheses come from dtd.ContentModel's toString() methods.
assertEquals(
"""DTD PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [
|<!ELEMENT AnyElement ANY>
|<!ELEMENT EmptyElement EMPTY>
|<!ELEMENT PCDataElement (#PCDATA)>
|<!ELEMENT MixedElement (#PCDATA|(element|complex))*>
|<!ELEMENT ChildrenElement ((element)*,(complex)*)>
|<!ELEMENT element (#PCDATA)>
|<!ELEMENT complex (#PCDATA)>
|<!ATTLIST complex
| implied CDATA #IMPLIED
| required CDATA #REQUIRED
| fixed CDATA #FIXED "fixed"
| default CDATA "default"
| enumerated (InStock|Backordered|Discontinued) "InStock">
|<!ENTITY AUTHOR "John Doe">
|<!NOTATION jpg PUBLIC "JPG 1.0">
|]""".stripMargin,
document.dtd.toString)

// XMLLoader resolves entities defined in the DTD:
// the XML parser parses and uses the DTD internally, so there is no need to install any additional entity resolvers.
assertEquals("""<document>John Doe</document>""", document.docElem.toString)

// The same document, parsed with ConstructingParser.
val document2: Document = ConstructingParser.fromSource(scala.io.Source.fromString(xml), preserveWS = false).document()

// ConstructingParser
// - ignores:
//     element declarations,
//     attribute list declarations,
//     some entity declarations,
//     notations;
// - captures:
//     decls: List[Decl] - for EntityDecl and PEReference,
//     ent: Map[String, EntityDecl];
// - returns only:
//     decls.
// So only the entity declaration shows up in the DTD it reports:
assertEquals(
s"""DTD PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [
|<!ENTITY AUTHOR "John Doe">
|]""".stripMargin,
document2.dtd.toString)

// ConstructingParser also resolves entities defined in the DTD.
assertEquals("""<document>John Doe</document>""", document2.docElem.toString)
}

@UnitTest
def nodeSeqNs(): Unit = {
Expand Down
2 changes: 1 addition & 1 deletion shared/src/main/scala/scala/xml/dtd/DTD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ abstract class DTD {
var ent: mutable.Map[String, EntityDecl] = new mutable.HashMap[String, EntityDecl]()

override def toString: String =
"DTD [\n%s%s]".format(
"DTD %s [\n%s]".format(
Option(externalID).getOrElse(""),
decls.mkString("", "\n", "\n")
)
Expand Down
3 changes: 2 additions & 1 deletion shared/src/main/scala/scala/xml/dtd/Decl.scala
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,12 @@ case class UnparsedEntityDecl(name: String, extID: ExternalID, notation: String)
extID.buildString(sb).append(" NDATA ").append(notation).append('>')
}
}

/** a notation declaration */
case class NotationDecl(name: String, extID: ExternalID) extends MarkupDecl {
override def buildString(sb: StringBuilder): StringBuilder = {
sb.append("<!NOTATION ").append(name).append(' ')
extID.buildString(sb)
extID.buildString(sb).append('>')
}
}

Expand Down
12 changes: 5 additions & 7 deletions shared/src/main/scala/scala/xml/factory/XMLLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,14 @@ trait XMLLoader[T <: Node] {
* The methods available in scala.xml.XML use the XML parser in the JDK
* (unless another parser is present on the classpath).
*/
private def getDocElem(document: Document): T = document.docElem.asInstanceOf[T]

def loadXML(inputSource: InputSource, parser: SAXParser): T = getDocElem(loadDocument(inputSource, parser))
def loadXMLNodes(inputSource: InputSource, parser: SAXParser): Seq[Node] = loadDocument(inputSource, parser).children

private def loadDocument(inputSource: InputSource, parser: SAXParser): Document = adapter.loadDocument(inputSource, parser)
private def loadDocument(inputSource: InputSource, reader: XMLReader): Document = adapter.loadDocument(inputSource, reader)
// TODO remove
def loadXML(inputSource: InputSource, parser: SAXParser): T = getDocElem(adapter.loadDocument(inputSource, parser.getXMLReader))
def loadXMLNodes(inputSource: InputSource, parser: SAXParser): Seq[Node] = adapter.loadDocument(inputSource, parser.getXMLReader).children
def adapter: parsing.FactoryAdapter = new parsing.NoBindingFactoryAdapter()

/** Loads XML Document. */
def loadDocument(source: InputSource): Document = loadDocument(source, reader)
def loadDocument(inputSource: InputSource): Document = adapter.loadDocument(inputSource, reader)
def loadFileDocument(fileName: String): Document = loadDocument(Source.fromFile(fileName))
def loadFileDocument(file: File): Document = loadDocument(Source.fromFile(file))
def loadDocument(url: URL): Document = loadDocument(Source.fromUrl(url))
Expand All @@ -76,6 +73,7 @@ trait XMLLoader[T <: Node] {
def loadStringDocument(string: String): Document = loadDocument(Source.fromString(string))

/** Loads XML element. */
private def getDocElem(document: Document): T = document.docElem.asInstanceOf[T]
def load(inputSource: InputSource): T = getDocElem(loadDocument(inputSource))
def loadFile(fileName: String): T = getDocElem(loadFileDocument(fileName))
def loadFile(file: File): T = getDocElem(loadFileDocument(file))
Expand Down
Loading