Skip to content

Commit

Permalink
Progressive parsing with StreamParser (#2096)
Browse files Browse the repository at this point in the history
A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if applicable.
  • Loading branch information
jhy authored Jan 5, 2024
1 parent 5ae722c commit 2b443df
Show file tree
Hide file tree
Showing 15 changed files with 1,061 additions and 116 deletions.
10 changes: 10 additions & 0 deletions src/main/java/org/jsoup/Connection.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.helper.RequestAuthenticator;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jspecify.annotations.Nullable;

import javax.net.ssl.SSLSocketFactory;
Expand Down Expand Up @@ -883,6 +884,15 @@ <p>Other body methods (like bufferUp, body, parse, etc) will generally not work
@return the response body input stream
*/
BufferedInputStream bodyStream();

/**
Returns a {@link StreamParser} that will parse the Response progressively.
<p>This default implementation does not create a parser: it always throws
{@link UnsupportedOperationException}. Implementations that support progressive parsing override it.</p>
* @return a StreamParser, prepared to parse this response.
* @throws IOException if an IO exception occurs preparing the parser.
* @throws UnsupportedOperationException if the implementation does not override this default.
*/
default StreamParser streamParser() throws IOException {
throw new UnsupportedOperationException();
}
}

/**
Expand Down
245 changes: 163 additions & 82 deletions src/main/java/org/jsoup/helper/DataUtil.java

Large diffs are not rendered by default.

32 changes: 30 additions & 2 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,19 @@
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSocketFactory;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.CookieManager;
Expand Down Expand Up @@ -950,22 +953,47 @@ public String contentType() {
return contentType;
}

public Document parse() throws IOException {
/** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. */
private InputStream prepareParse() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
InputStream stream = bodyStream;
if (byteData != null) { // bytes have been read in to the buffer, parse that
stream = new ByteArrayInputStream(byteData.array());
inputStreamRead = false; // ok to reparse if in bytes
}
Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
Validate.notNull(stream);
inputStreamRead = true;
return stream;
}

/**
 Parses the response body into a Document, and attaches a connection (built from the request and this response)
 to it. The response charset is updated from the parsed document's output settings.
 @return the parsed Document
 @throws IOException if reading or parsing the body fails
 */
@Override public Document parse() throws IOException {
    // prepareParse() validates the response state and flags the input as read, so that flag is not set again here
    InputStream stream = prepareParse();
    try {
        Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
        doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
        charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
        return doc;
    } finally {
        // close even when the parse throws, so a failed parse does not leave the body stream open
        safeClose();
    }
}

/**
 Prepares a {@link StreamParser} over the response body. The parse is initialised (so the parser's document
 exists and is connected to this response) but not stepped.
 <p>On success the input stream is left open: the caller must close the returned StreamParser to close it.
 If preparing the parser fails, the stream is closed here, as the caller never receives a parser to close.</p>
 @return a StreamParser, prepared to parse this response
 @throws IOException if an IO exception occurs preparing the parser
 */
@Override public StreamParser streamParser() throws IOException {
    InputStream stream = prepareParse();
    String baseUri = url.toExternalForm();
    try {
        DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
        // note: charset detection may have produced a document while scanning the meta-data; it is not used here,
        // because this method needs a stream parse rather than a completed document. todo - revisit.

        // set up the stream parser and rig this connection up to the parsed doc:
        StreamParser streamer = new StreamParser(req.parser());
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
        DataUtil.maybeSkipBom(reader, charsetDoc);
        streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
        streamer.document().connection(new HttpConnection(req, this));
        charset = charsetDoc.charset.name();

        // we don't safeClose() as in parse(); caller must close streamParser to close InputStream stream
        return streamer;
    } catch (IOException | RuntimeException e) {
        // setup failed, so no StreamParser is handed back for the caller to close: release the stream here
        safeClose();
        throw e;
    }
}

private void prepareByteData() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
if (bodyStream != null && byteData == null) {
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jsoup/internal/ControllableInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ public int read(byte[] b, int off, int len) throws IOException {
remaining -= read;
return read;
} catch (SocketTimeoutException e) {
if (expired())
throw e;
return 0;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public final class CharacterReader {

public CharacterReader(Reader input, int sz) {
Validate.notNull(input);
Validate.isTrue(input.markSupported());
Validate.isTrue(input.markSupported(), "The supplied Reader must support mark(), but does not.");
reader = input;
charBuf = new char[Math.min(sz, maxBufferLen)];
bufferUp();
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,9 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
fragmentParsing = false;
}

@Override List<Node> parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) {
@Override List<Node> doParseFragment(@Nullable Element context) {
// context may be null
state = HtmlTreeBuilderState.Initial;
initialiseParse(new StringReader(inputFragment), baseUri, parser);
contextElement = context;
fragmentParsing = true;
Element root = null;
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ public static List<Node> parseFragment(String fragmentHtml, Element context, Str
*/
public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
    XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
    // parseFragment takes (input, context, baseUri, parser); no context element is supplied here, so pass null
    return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder));
}

/**
Expand Down
Loading

0 comments on commit 2b443df

Please sign in to comment.