From 1a55dfd3d9e0519e0d1142e9c9f7e811f0711529 Mon Sep 17 00:00:00 2001 From: Per Landberg Date: Fri, 5 May 2023 11:28:01 +0200 Subject: [PATCH 1/5] Add testcase and fix for form attributes deduplication This fixes #1949 --- src/main/java/org/jsoup/parser/HtmlTreeBuilder.java | 8 ++++++++ src/test/java/org/jsoup/parser/HtmlParserTest.java | 12 ++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 06e9c749b2..09f1d8bcc6 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -282,6 +282,14 @@ Element insertEmpty(Token.StartTag startTag) { } FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { + // cleanup duplicate attributes: + if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) { + int dupes = startTag.attributes.deduplicate(settings); + if (dupes > 0) { + error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); + } + } + Tag tag = tagFor(startTag.name(), settings); FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes)); if (checkTemplateStack) { diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index e7f5c23a41..fb8e20ac51 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -58,6 +58,18 @@ public class HtmlParserTest { assertEquals("Dropped duplicate attribute(s) in tag [p]", parser.getErrors().get(0).getErrorMessage()); } + @Test public void dropsDuplicateAttributesInFormElement() { + String html = "
"; + Parser parser = Parser.htmlParser().setTrackErrors(10); + Document doc = parser.parseInput(html, ""); + + Element p = doc.selectFirst("form"); + assertEquals("
", p.outerHtml()); // normalized names due to lower casing + + assertEquals(1, parser.getErrors().size()); + assertEquals("Dropped duplicate attribute(s) in tag [form]", parser.getErrors().get(0).getErrorMessage()); + } + @Test public void retainsAttributesOfDifferentCaseIfSensitive() { String html = "

Text

"; Parser parser = Parser.htmlParser().settings(preserveCase); From f27882541a6297c01af629aba3617f71b01e8d39 Mon Sep 17 00:00:00 2001 From: Per Landberg Date: Sat, 6 May 2023 17:20:05 +0200 Subject: [PATCH 2/5] Factor out attribute deduplication to it's own private method Also add new testcase and fix for attributes deduplication in empty tags --- .../org/jsoup/parser/HtmlTreeBuilder.java | 33 ++++++++++--------- .../java/org/jsoup/parser/HtmlParserTest.java | 12 +++++++ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 09f1d8bcc6..2d832de443 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -10,6 +10,7 @@ import org.jsoup.nodes.FormElement; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; +import org.jsoup.parser.Token.StartTag; import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; @@ -227,15 +228,9 @@ void error(HtmlTreeBuilderState state) { } Element insert(final Token.StartTag startTag) { - // cleanup duplicate attributes: - if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) { - int dupes = startTag.attributes.deduplicate(settings); - if (dupes > 0) { - error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); - } - } + dedupliateTagAttributes(startTag); - // handle empty unknown tags + // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. if (startTag.isSelfClosing()) { Element el = insertEmpty(startTag); @@ -250,7 +245,7 @@ Element insert(final Token.StartTag startTag) { return el; } - Element insertStartTag(String startTagName) { + Element insertStartTag(String startTagName) { Element el = new Element(tagFor(startTagName, settings), null); insert(el); return el; @@ -267,6 +262,8 @@ private void insert(Element el, @Nullable Token token) { } Element insertEmpty(Token.StartTag startTag) { + dedupliateTagAttributes(startTag); + Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); insertNode(el, startTag); @@ -282,13 +279,7 @@ Element insertEmpty(Token.StartTag startTag) { } FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { - // cleanup duplicate attributes: - if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) { - int dupes = startTag.attributes.deduplicate(settings); - if (dupes > 0) { - error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); - } - } + dedupliateTagAttributes(startTag); Tag tag = tagFor(startTag.name(), settings); FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes)); @@ -348,6 +339,16 @@ else if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), onNodeInserted(node, token); } + /** Cleanup duplicate attributes. **/ + private void dedupliateTagAttributes(StartTag startTag) { + if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) { + int dupes = startTag.attributes.deduplicate(settings); + if (dupes > 0) { + error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); + } + } + } + Element pop() { int size = stack.size(); return stack.remove(size-1); diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index fb8e20ac51..5ee702dbd1 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -58,6 +58,18 @@ public class HtmlParserTest { assertEquals("Dropped duplicate attribute(s) in tag [p]", parser.getErrors().get(0).getErrorMessage()); } + @Test public void dropsDuplicateAttributesInEmptyElement() { + String html = "Text>"; + Parser parser = Parser.htmlParser().setTrackErrors(10); + Document doc = parser.parseInput(html, ""); + + Element p = doc.selectFirst("img"); + assertEquals("", p.outerHtml()); // normalized names due to lower casing + + assertEquals(1, parser.getErrors().size()); + assertEquals("Dropped duplicate attribute(s) in tag [img]", parser.getErrors().get(0).getErrorMessage()); + } + @Test public void dropsDuplicateAttributesInFormElement() { String html = "
"; Parser parser = Parser.htmlParser().setTrackErrors(10); From a535795d09210bc5b04ffb4efdae8a3d51abab04 Mon Sep 17 00:00:00 2001 From: Per Landberg Date: Sat, 6 May 2023 17:31:33 +0200 Subject: [PATCH 3/5] Fix formatting in added code To use spaces instead of tabs as rest of the file --- src/main/java/org/jsoup/parser/HtmlTreeBuilder.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 2d832de443..35d1a190f1 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -228,9 +228,9 @@ void error(HtmlTreeBuilderState state) { } Element insert(final Token.StartTag startTag) { - dedupliateTagAttributes(startTag); + dedupliateTagAttributes(startTag); - // handle empty unknown tags + // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. if (startTag.isSelfClosing()) { Element el = insertEmpty(startTag); @@ -262,7 +262,7 @@ private void insert(Element el, @Nullable Token token) { } Element insertEmpty(Token.StartTag startTag) { - dedupliateTagAttributes(startTag); + dedupliateTagAttributes(startTag); Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); @@ -279,7 +279,7 @@ Element insertEmpty(Token.StartTag startTag) { } FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { - dedupliateTagAttributes(startTag); + dedupliateTagAttributes(startTag); Tag tag = tagFor(startTag.name(), settings); FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes)); @@ -346,8 +346,8 @@ private void dedupliateTagAttributes(StartTag startTag) { if (dupes > 0) { error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); } - } - } + } + } Element pop() { int size = stack.size(); From f28ade2b23ca77cfc8e3fad839b70b42fc16dc65 Mon Sep 17 00:00:00 2001 From: Per Landberg Date: Sat, 6 May 2023 17:34:57 +0200 Subject: [PATCH 4/5] Cleanup test html in dropsDuplicateAttributesInEmptyElement --- src/test/java/org/jsoup/parser/HtmlParserTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 5ee702dbd1..465f1cdf9c 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -59,7 +59,7 @@ public class HtmlParserTest { } @Test public void dropsDuplicateAttributesInEmptyElement() { - String html = "Text>"; + String html = ""; Parser parser = Parser.htmlParser().setTrackErrors(10); Document doc = parser.parseInput(html, ""); From 5d5a410e3fd42c614883fdbc7f982b93e8c89477 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 8 May 2023 18:32:10 +1000 Subject: [PATCH 5/5] Simplified dupe tests and method name --- .../org/jsoup/parser/HtmlTreeBuilder.java | 8 ++-- .../java/org/jsoup/parser/HtmlParserTest.java | 44 +++++++------------ 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 35d1a190f1..be0498c5ae 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -228,7 +228,7 @@ void error(HtmlTreeBuilderState state) { } Element insert(final Token.StartTag startTag) { - dedupliateTagAttributes(startTag); + dedupeAttributes(startTag); // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. @@ -262,7 +262,7 @@ private void insert(Element el, @Nullable Token token) { } Element insertEmpty(Token.StartTag startTag) { - dedupliateTagAttributes(startTag); + dedupeAttributes(startTag); Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); @@ -279,7 +279,7 @@ Element insertEmpty(Token.StartTag startTag) { } FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { - dedupliateTagAttributes(startTag); + dedupeAttributes(startTag); Tag tag = tagFor(startTag.name(), settings); FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes)); @@ -340,7 +340,7 @@ else if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), } /** Cleanup duplicate attributes. **/ - private void dedupliateTagAttributes(StartTag startTag) { + private void dedupeAttributes(StartTag startTag) { if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) { int dupes = startTag.attributes.deduplicate(settings); if (dupes > 0) { diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 465f1cdf9c..4172d0f59e 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -7,13 +7,16 @@ import org.jsoup.nodes.*; import org.jsoup.safety.Safelist; import org.jsoup.select.Elements; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.util.List; +import java.util.stream.Stream; import static org.jsoup.parser.ParseSettings.preserveCase; import static org.junit.jupiter.api.Assertions.*; @@ -46,40 +49,25 @@ public class HtmlParserTest { assertEquals("foo > bar", p.attr("class")); } - @Test public void dropsDuplicateAttributes() { - String html = "

Text

"; + @ParameterizedTest @MethodSource("dupeAttributeData") + public void dropsDuplicateAttributes(String html, String expected) { Parser parser = Parser.htmlParser().setTrackErrors(10); Document doc = parser.parseInput(html, ""); - Element p = doc.selectFirst("p"); - assertEquals("

Text

", p.outerHtml()); // normalized names due to lower casing + Element el = doc.expectFirst("body > *"); + assertEquals(expected, el.outerHtml()); // normalized names due to lower casing + String tag = el.normalName(); assertEquals(1, parser.getErrors().size()); - assertEquals("Dropped duplicate attribute(s) in tag [p]", parser.getErrors().get(0).getErrorMessage()); + assertEquals("Dropped duplicate attribute(s) in tag [" + tag + "]", parser.getErrors().get(0).getErrorMessage()); } - @Test public void dropsDuplicateAttributesInEmptyElement() { - String html = ""; - Parser parser = Parser.htmlParser().setTrackErrors(10); - Document doc = parser.parseInput(html, ""); - - Element p = doc.selectFirst("img"); - assertEquals("", p.outerHtml()); // normalized names due to lower casing - - assertEquals(1, parser.getErrors().size()); - assertEquals("Dropped duplicate attribute(s) in tag [img]", parser.getErrors().get(0).getErrorMessage()); - } - - @Test public void dropsDuplicateAttributesInFormElement() { - String html = "
"; - Parser parser = Parser.htmlParser().setTrackErrors(10); - Document doc = parser.parseInput(html, ""); - - Element p = doc.selectFirst("form"); - assertEquals("
", p.outerHtml()); // normalized names due to lower casing - - assertEquals(1, parser.getErrors().size()); - assertEquals("Dropped duplicate attribute(s) in tag [form]", parser.getErrors().get(0).getErrorMessage()); + private static Stream dupeAttributeData() { + return Stream.of( + Arguments.of("

Text

", "

Text

"), + Arguments.of("", ""), + Arguments.of("
", "
") + ); } @Test public void retainsAttributesOfDifferentCaseIfSensitive() {