diff --git a/langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java b/langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java
new file mode 100644
index 000000000..9c95ffc19
--- /dev/null
+++ b/langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import lombok.Data;
+
+/**
+ * A markdown header that is in scope while a document is being split.
+ *
+ * @author HamaWhite
+ */
+@Data
+public class HeaderType {
+
+    /** Header level; the splitter sets it to the number of {@code #} characters in the separator. */
+    private int level;
+
+    /** Header name; the splitter uses it as the metadata key. */
+    private String name;
+
+    /** Header data; the splitter stores the header text without its separator. */
+    private String data;
+
+    /**
+     * Create a new HeaderType.
+     *
+     * @param level Header level
+     * @param name Header name
+     * @param data Header data
+     */
+    public HeaderType(int level, String name, String data) {
+        this.level = level;
+        this.name = name;
+        this.data = data;
+    }
+}
diff --git a/langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java b/langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java
new file mode 100644
index 000000000..d3ec29d27
--- /dev/null
+++ b/langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import lombok.Data;
+
+import java.util.Map;
+
+/**
+ * A block of text together with the header metadata that applies to it (the counterpart of a typed dict).
+ *
+ * @author HamaWhite
+ */
+@Data
+public class LineType {
+
+    /** Line content; the splitter may join several source lines into one value. */
+    private String content;
+
+    /** Line metadata; the splitter stores header name to header text here. */
+    private Map<String, Object> metadata;
+
+    /**
+     * Create a new LineType.
+     *
+     * @param content Line content
+     * @param metadata Line metadata
+     */
+    public LineType(String content, Map<String, Object> metadata) {
+        this.content = content;
+        this.metadata = metadata;
+    }
+}
diff --git a/langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java b/langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java
new file mode 100644
index 000000000..c3a37fee2
--- /dev/null
+++ b/langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import com.google.common.collect.Maps;
+import com.hw.langchain.schema.Document;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.util.*;
+
+/**
+ * Implementation of splitting markdown files based on specified headers.
+ *
+ * <p>Every returned chunk carries, as metadata, the tracked headers it is nested under.
+ *
+ * @author HamaWhite
+ */
+public class MarkdownHeaderTextSplitter {
+
+    /**
+     * Headers we want to track, as (separator, metadata name) pairs sorted by separator length, longest first
+     */
+    private final List<Pair<String, String>> headersToSplitOn;
+
+    /**
+     * Return each line w/ associated headers
+     */
+    private final boolean returnEachLine;
+
+    /**
+     * Create a splitter that aggregates consecutive lines with identical header metadata into chunks.
+     *
+     * @param headersToSplitOn (separator, metadata name) pairs, e.g. {@code ("#", "Header 1")}
+     */
+    public MarkdownHeaderTextSplitter(List<Pair<String, String>> headersToSplitOn) {
+        this(headersToSplitOn, false);
+    }
+
+    /**
+     * Create a splitter.
+     *
+     * @param headersToSplitOn (separator, metadata name) pairs, e.g. {@code ("#", "Header 1")}
+     * @param returnEachLine {@code true} to return every collected line as its own document instead of
+     *        aggregating lines with identical metadata
+     * @throws NullPointerException if {@code headersToSplitOn} is {@code null}
+     */
+    public MarkdownHeaderTextSplitter(List<Pair<String, String>> headersToSplitOn, boolean returnEachLine) {
+        Objects.requireNonNull(headersToSplitOn, "headersToSplitOn");
+        // Output line-by-line or aggregated into chunks w/ common headers
+        this.returnEachLine = returnEachLine;
+
+        // Given the headers we want to split on, (e.g., "#, ##, etc") order by length, longest first
+        this.headersToSplitOn = headersToSplitOn.stream()
+                .sorted(Comparator.<Pair<String, String>>comparingInt(e -> e.getKey().length()).reversed())
+                .toList();
+    }
+
+    /**
+     * Combine lines with common metadata into chunks.
+     *
+     * <p>The given {@link LineType} instances are left untouched; merged content is built on private copies.
+     *
+     * @param lines Line of text / associated header metadata
+     * @return List of Document chunks
+     */
+    public List<Document> aggregateLinesToChunks(List<LineType> lines) {
+        List<LineType> aggregatedChunks = new ArrayList<>();
+
+        for (LineType line : lines) {
+            LineType lastChunk =
+                    aggregatedChunks.isEmpty() ? null : aggregatedChunks.get(aggregatedChunks.size() - 1);
+            if (lastChunk != null && Objects.equals(lastChunk.getMetadata(), line.getMetadata())) {
+                // If the last chunk in the aggregated list has the same metadata as the current line,
+                // append the current content to the last chunk's content
+                lastChunk.setContent(lastChunk.getContent() + " \n" + line.getContent());
+            } else {
+                // Otherwise start a new chunk. Copy the line so that appending to the chunk later
+                // never modifies an object owned by the caller
+                aggregatedChunks.add(new LineType(line.getContent(), line.getMetadata()));
+            }
+        }
+        return aggregatedChunks.stream()
+                .map(chunk -> new Document(chunk.getContent(), chunk.getMetadata()))
+                .toList();
+    }
+
+    /**
+     * Split markdown file.
+     *
+     * @param text Markdown file
+     * @return List of Document chunks
+     * @throws NullPointerException if {@code text} is {@code null}
+     */
+    public List<Document> splitText(String text) {
+        Objects.requireNonNull(text, "text");
+        List<LineType> linesWithMetadata = new ArrayList<>();
+        // Content and metadata of the chunk currently being processed
+        List<String> currentContent = new ArrayList<>();
+        Map<String, Object> currentMetadata = Maps.newHashMap();
+        // Keep track of the nested header structure
+        List<HeaderType> headerStack = new ArrayList<>();
+        Map<String, Object> initialMetadata = Maps.newHashMap();
+
+        // Split the input text by newline character ("\n").
+        String[] lines = text.split("\n");
+        for (String line : lines) {
+            String strippedLine = line.strip();
+            // Check each line against each of the header types (e.g., #, ##)
+            boolean foundHeader = processLine(strippedLine, linesWithMetadata, currentContent, currentMetadata,
+                    headerStack, initialMetadata);
+
+            if (!foundHeader && !strippedLine.isEmpty()) {
+                currentContent.add(strippedLine);
+            } else if (!currentContent.isEmpty()) {
+                // A blank line ends the block of text collected so far (a header was already flushed above)
+                linesWithMetadata.add(new LineType(String.join("\n", currentContent), new HashMap<>(currentMetadata)));
+                currentContent.clear();
+            }
+            // Text after this line belongs to whatever headers are in scope now
+            currentMetadata = new HashMap<>(initialMetadata);
+        }
+
+        return processOutput(linesWithMetadata, currentContent, currentMetadata);
+    }
+
+    /**
+     * Check a line against every tracked header and, on a match, update the header state.
+     *
+     * <p>On a match with a non-null header name, headers of the same or a deeper level are popped from
+     * {@code headerStack} and removed from {@code initialMetadata} before the new header is recorded. Text
+     * collected before the header is flushed to {@code linesWithMetadata} with {@code currentMetadata}.
+     *
+     * @return {@code true} if the line is one of the headers to split on
+     */
+    private boolean processLine(String strippedLine, List<LineType> linesWithMetadata, List<String> currentContent,
+            Map<String, Object> currentMetadata, List<HeaderType> headerStack, Map<String, Object> initialMetadata) {
+        for (Pair<String, String> pair : headersToSplitOn) {
+            String sep = pair.getLeft();
+            String name = pair.getValue();
+            if (isHeaderToSplitOn(strippedLine, sep)) {
+                // Ensure we are tracking the header as metadata
+                if (name != null) {
+                    // Get the current header level
+                    int currentHeaderLevel = StringUtils.countMatches(sep, "#");
+                    // Pop out headers of lower or same level from the stack
+                    while (!headerStack.isEmpty()
+                            && headerStack.get(headerStack.size() - 1).getLevel() >= currentHeaderLevel) {
+                        // We have encountered a new header at the same or higher level
+                        HeaderType poppedHeader = headerStack.remove(headerStack.size() - 1);
+                        // Clear the metadata for the popped header in initialMetadata
+                        initialMetadata.remove(poppedHeader.getName());
+                    }
+                    // Push the current header to the stack
+                    HeaderType header =
+                            new HeaderType(currentHeaderLevel, name, strippedLine.substring(sep.length()).strip());
+                    headerStack.add(header);
+                    // Update initialMetadata with the current header
+                    initialMetadata.put(name, header.getData());
+                }
+                // Add the previous line to the linesWithMetadata only if currentContent is not empty
+                if (!currentContent.isEmpty()) {
+                    linesWithMetadata
+                            .add(new LineType(String.join("\n", currentContent), new HashMap<>(currentMetadata)));
+                    currentContent.clear();
+                }
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Check if line starts with a header that we intend to split on.
+     * A header with no text and a header followed by a space are both valid signs that sep is being used as a
+     * header.
+     */
+    private boolean isHeaderToSplitOn(String strippedLine, String sep) {
+        return strippedLine.startsWith(sep) &&
+                (strippedLine.length() == sep.length() || strippedLine.charAt(sep.length()) == ' ');
+    }
+
+    /**
+     * Flush any text still being collected and convert the collected lines into documents.
+     *
+     * @return the aggregated chunks, or one document per collected line when {@code returnEachLine} is set
+     */
+    private List<Document> processOutput(List<LineType> linesWithMetadata, List<String> currentContent,
+            Map<String, Object> currentMetadata) {
+        if (!currentContent.isEmpty()) {
+            linesWithMetadata.add(new LineType(String.join("\n", currentContent), currentMetadata));
+        }
+        // linesWithMetadata has each line with associated header metadata; aggregate these into chunks based on
+        // common metadata
+        if (!returnEachLine) {
+            return aggregateLinesToChunks(linesWithMetadata);
+        } else {
+            return linesWithMetadata.stream()
+                    .map(chunk -> new Document(chunk.getContent(), chunk.getMetadata()))
+                    .toList();
+        }
+    }
+}
diff --git a/langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java b/langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java
new file mode 100644
index 000000000..400ce7eea
--- /dev/null
+++ b/langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import com.hw.langchain.schema.Document;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Tests for {@link MarkdownHeaderTextSplitter}.
+ *
+ * @author HamaWhite
+ */
+class MarkdownHeaderTextSplitterTest {
+
+    /**
+     * Test markdown splitter by header: Case 1.
+     */
+    @Test
+    void testMdHeaderTextSplitter1() {
+        String markdownDocument = """
+                # Foo
+
+                ## Bar
+
+                Hi this is Jim
+
+                Hi this is Joe
+
+                ## Baz
+
+                Hi this is Molly
+                """;
+
+        List<Pair<String, String>> headersToSplitOn = List.of(
+                Pair.of("#", "Header 1"),
+                Pair.of("##", "Header 2"));
+
+        MarkdownHeaderTextSplitter markdownSplitter = new MarkdownHeaderTextSplitter(headersToSplitOn);
+        List<Document> output = markdownSplitter.splitText(markdownDocument);
+
+        List<Document> expectedOutput = List.of(
+                new Document(
+                        "Hi this is Jim \nHi this is Joe",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar")),
+                new Document(
+                        "Hi this is Molly",
+                        Map.of("Header 1", "Foo", "Header 2", "Baz")));
+        assertEquals(expectedOutput, output);
+    }
+
+    /**
+     * Test markdown splitter by header: Case 2.
+     */
+    @Test
+    void testMdHeaderTextSplitter2() {
+        String markdownDocument = """
+                # Foo
+
+                ## Bar
+
+                Hi this is Jim
+
+                Hi this is Joe
+
+                ### Boo
+
+                Hi this is Lance
+
+                ## Baz
+
+                Hi this is Molly
+                """;
+
+        List<Pair<String, String>> headersToSplitOn = List.of(
+                Pair.of("#", "Header 1"),
+                Pair.of("##", "Header 2"),
+                Pair.of("###", "Header 3"));
+
+        MarkdownHeaderTextSplitter markdownSplitter = new MarkdownHeaderTextSplitter(headersToSplitOn);
+        List<Document> output = markdownSplitter.splitText(markdownDocument);
+
+        List<Document> expectedOutput = List.of(
+                new Document(
+                        "Hi this is Jim \nHi this is Joe",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar")),
+                new Document(
+                        "Hi this is Lance",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar", "Header 3", "Boo")),
+                new Document(
+                        "Hi this is Molly",
+                        Map.of("Header 1", "Foo", "Header 2", "Baz")));
+        assertEquals(expectedOutput, output);
+    }
+
+    /**
+     * Test markdown splitter by header: Case 3.
+     */
+    @Test
+    void testMdHeaderTextSplitter3() {
+        String markdownDocument = """
+                # Foo
+
+                ## Bar
+
+                Hi this is Jim
+
+                Hi this is Joe
+
+                ### Boo
+
+                Hi this is Lance
+
+                #### Bim
+
+                Hi this is John
+
+                ## Baz
+
+                Hi this is Molly
+                """;
+
+        List<Pair<String, String>> headersToSplitOn = List.of(
+                Pair.of("#", "Header 1"),
+                Pair.of("##", "Header 2"),
+                Pair.of("###", "Header 3"),
+                Pair.of("####", "Header 4"));
+
+        MarkdownHeaderTextSplitter markdownSplitter = new MarkdownHeaderTextSplitter(headersToSplitOn);
+        List<Document> output = markdownSplitter.splitText(markdownDocument);
+
+        List<Document> expectedOutput = List.of(
+                new Document(
+                        "Hi this is Jim \nHi this is Joe",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar")),
+                new Document(
+                        "Hi this is Lance",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar", "Header 3", "Boo")),
+                new Document(
+                        "Hi this is John",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar", "Header 3", "Boo", "Header 4", "Bim")),
+                new Document(
+                        "Hi this is Molly",
+                        Map.of("Header 1", "Foo", "Header 2", "Baz")));
+        assertEquals(expectedOutput, output);
+    }
+
+    /**
+     * Aggregating lines must not modify the LineType instances passed in by the caller.
+     */
+    @Test
+    void testAggregateLinesToChunksKeepsInputUnchanged() {
+        Map<String, Object> metadata = Map.of("Header 1", "Foo");
+        LineType first = new LineType("Hi this is Jim", metadata);
+        LineType second = new LineType("Hi this is Joe", metadata);
+
+        MarkdownHeaderTextSplitter markdownSplitter =
+                new MarkdownHeaderTextSplitter(List.of(Pair.of("#", "Header 1")));
+        List<Document> output = markdownSplitter.aggregateLinesToChunks(List.of(first, second));
+
+        assertEquals(List.of(new Document("Hi this is Jim \nHi this is Joe", metadata)), output);
+        assertEquals("Hi this is Jim", first.getContent());
+        assertEquals("Hi this is Joe", second.getContent());
+    }
+}