From 7206dea70340f7db4614033c82b0667d5a95c416 Mon Sep 17 00:00:00 2001
From: HamaWhite <baisongxx@gmail.com>
Date: Fri, 7 Jul 2023 14:21:09 +0800
Subject: [PATCH 1/3] Use a literal request timeout in README and var in RetrievalQaExample

---
 README.md                                                       | 2 +-
 .../com/hw/langchain/examples/chains/RetrievalQaExample.java    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 023adf582..726d3c8af 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ var llm = OpenAI.builder()
         .openaiOrganization("xxx")
         .openaiApiKey("xxx")
         .openaiProxy("http://host:port")
-        .requestTimeout(requestTimeout)
+        .requestTimeout(16)
         .build()
         .init();
 ```
diff --git a/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java b/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java
index 395172658..3bbb6fb2a 100644
--- a/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java
+++ b/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java
@@ -66,7 +66,7 @@ public static void main(String[] args) {
         var llm = OpenAI.builder().temperature(0).requestTimeout(30).build().init();
         var qa = RetrievalQA.fromChainType(llm, STUFF, pinecone.asRetriever());
 
-        String query = "What did the president say about Ketanji Brown Jackson";
+        var query = "What did the president say about Ketanji Brown Jackson";
         var result = qa.run(query);
         println(result);
     }

From 3b6587bba522d3f99c7e37669fa39035bc2c21c8 Mon Sep 17 00:00:00 2001
From: HamaWhite <baisongxx@gmail.com>
Date: Fri, 7 Jul 2023 16:01:09 +0800
Subject: [PATCH 2/3] Document the 1536 embedding dimension used when creating Pinecone indexes

---
 .../com/hw/langchain/vectorstores/pinecone/PineconeTest.java     | 1 +
 .../com/hw/langchain/examples/chains/RetrievalQaExample.java     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/langchain-core/src/test/java/com/hw/langchain/vectorstores/pinecone/PineconeTest.java b/langchain-core/src/test/java/com/hw/langchain/vectorstores/pinecone/PineconeTest.java
index d60ef7673..bacd98ec3 100644
--- a/langchain-core/src/test/java/com/hw/langchain/vectorstores/pinecone/PineconeTest.java
+++ b/langchain-core/src/test/java/com/hw/langchain/vectorstores/pinecone/PineconeTest.java
@@ -81,6 +81,7 @@ private Pinecone createPinecone() {
      */
     private void ensureIndexCreated() {
         if (!client.listIndexes().contains(INDEX_NAME)) {
+            // the text-embedding-ada-002 model has an output dimension of 1536.
             var request = CreateIndexRequest.builder()
                     .name(INDEX_NAME)
                     .dimension(1536)
diff --git a/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java b/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java
index 3bbb6fb2a..5afaf8c96 100644
--- a/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java
+++ b/langchain-examples/src/main/java/com/hw/langchain/examples/chains/RetrievalQaExample.java
@@ -77,6 +77,7 @@ public static void main(String[] args) {
      */
     private static void createPineconeIndex(PineconeClient client) {
         if (!client.listIndexes().contains(INDEX_NAME)) {
+            // the text-embedding-ada-002 model has an output dimension of 1536.
             var request = CreateIndexRequest.builder()
                     .name(INDEX_NAME)
                     .dimension(1536)

From 0d048e9918839db05ee6729f0da0d60cd98adf22 Mon Sep 17 00:00:00 2001
From: HamaWhite <baisongxx@gmail.com>
Date: Sat, 8 Jul 2023 01:26:48 +0800
Subject: [PATCH 3/3] Support MarkdownHeaderTextSplitter

---
 .../langchain/text/splitter/HeaderType.java   |  47 +++++
 .../hw/langchain/text/splitter/LineType.java  |  47 +++++
 .../splitter/MarkdownHeaderTextSplitter.java  | 182 ++++++++++++++++++
 .../MarkdownHeaderTextSplitterTest.java       | 167 ++++++++++++++++
 4 files changed, 443 insertions(+)
 create mode 100644 langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java
 create mode 100644 langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java
 create mode 100644 langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java
 create mode 100644 langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java

diff --git a/langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java b/langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java
new file mode 100644
index 000000000..9c95ffc19
--- /dev/null
+++ b/langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import lombok.Data;
+
+/**
+ * @author HamaWhite
+ */
+@Data
+public class HeaderType {
+
+    private int level;
+
+    private String name;
+
+    private String data;
+
+    /**
+     * Create a header entry of the kind MarkdownHeaderTextSplitter keeps on its header stack.
+     *
+     * @param level Header level; MarkdownHeaderTextSplitter passes the number of '#' in the separator
+     * @param name  Header name; MarkdownHeaderTextSplitter uses it as the metadata key
+     * @param data  Header data; MarkdownHeaderTextSplitter passes the stripped text after the separator
+     */
+    public HeaderType(int level, String name, String data) {
+        this.level = level;
+        this.name = name;
+        this.data = data;
+    }
+}
diff --git a/langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java b/langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java
new file mode 100644
index 000000000..d3ec29d27
--- /dev/null
+++ b/langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import lombok.Data;
+
+import java.util.Map;
+
+/**
+ * A piece of text content together with the metadata map that applies to it.
+ *
+ * @author HamaWhite
+ */
+@Data
+public class LineType {
+
+    private String content;
+
+    private Map<String, Object> metadata;
+
+    /**
+     * Create a new LineType.
+     *
+     * @param content  Line content; MarkdownHeaderTextSplitter joins several lines with "\n" here
+     * @param metadata Line metadata; the map is stored by reference, not copied
+     */
+    public LineType(String content, Map<String, Object> metadata) {
+        this.content = content;
+        this.metadata = metadata;
+    }
+}
diff --git a/langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java b/langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java
new file mode 100644
index 000000000..c3a37fee2
--- /dev/null
+++ b/langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import com.google.common.collect.Maps;
+import com.hw.langchain.schema.Document;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.util.*;
+
+/**
+ * Splits Markdown text into Documents whose metadata records the tracked headers above each chunk.
+ *
+ * @author HamaWhite
+ */
+public class MarkdownHeaderTextSplitter {
+
+    /**
+     * Headers to track as (separator, metadata name) pairs, sorted with the longest separator first
+     */
+    private final List<Pair<String, String>> headersToSplitOn;
+
+    /**
+     * If true, return one Document per run of non-blank lines instead of merging runs with equal metadata
+     */
+    private final boolean returnEachLine;
+
+    public MarkdownHeaderTextSplitter(List<Pair<String, String>> headersToSplitOn) {
+        this(headersToSplitOn, false);
+    }
+
+    public MarkdownHeaderTextSplitter(List<Pair<String, String>> headersToSplitOn, boolean returnEachLine) {
+        // Output line-by-line or aggregated into chunks w/ common headers
+        this.returnEachLine = returnEachLine;
+
+        // Keep a sorted copy of the headers to split on (e.g. "#", "##"), longest separator first
+        this.headersToSplitOn = headersToSplitOn.stream()
+                .sorted(Comparator.<Pair<String, String>>comparingInt(e -> e.getLeft().length()).reversed())
+                .toList();
+    }
+
+    /**
+     * Combine consecutive lines with equal metadata into chunks, joining their content with "  \n".
+     * The given LineType objects are copied rather than modified, so repeated calls give the same result.
+     *
+     * @param lines Line of text / associated header metadata
+     * @return List of Document chunks
+     */
+    public List<Document> aggregateLinesToChunks(List<LineType> lines) {
+        List<LineType> chunks = new ArrayList<>();
+
+        for (LineType line : lines) {
+            LineType lastChunk = chunks.isEmpty() ? null : chunks.get(chunks.size() - 1);
+            if (lastChunk != null && lastChunk.getMetadata().equals(line.getMetadata())) {
+                // lastChunk is always a copy made in the else branch, so this never edits the caller's objects
+                lastChunk.setContent(lastChunk.getContent() + "  \n" + line.getContent());
+            } else {
+                // Start a new chunk from a copy; adding the line itself would let later appends rewrite it
+                chunks.add(new LineType(line.getContent(), line.getMetadata()));
+            }
+        }
+
+        return chunks.stream()
+                .map(chunk -> new Document(chunk.getContent(), chunk.getMetadata()))
+                .toList();
+    }
+
+    /**
+     * Split Markdown text into Documents, tagging each with the tracked headers it appears under.
+     *
+     * @param text Markdown content whose lines are separated by "\n"
+     * @return List of Document chunks
+     */
+    public List<Document> splitText(String text) {
+        List<LineType> linesWithMetadata = new ArrayList<>();
+        // Content and metadata of the chunk currently being processed
+        List<String> currentContent = new ArrayList<>();
+        Map<String, Object> currentMetadata = Maps.newHashMap();
+        // Stack of currently open headers, and the header name -> header text map kept in step with it
+        List<HeaderType> headerStack = new ArrayList<>();
+        Map<String, String> initialMetadata = Maps.newHashMap();
+
+        for (String line : text.split("\n")) {
+            String strippedLine = line.strip();
+            // Check each line against each of the header types (e.g., #, ##)
+            boolean foundHeader = processLine(strippedLine, linesWithMetadata, currentContent, currentMetadata,
+                    headerStack, initialMetadata);
+
+            if (!foundHeader && !strippedLine.isEmpty()) {
+                currentContent.add(strippedLine);
+            } else if (!currentContent.isEmpty()) {
+                // A blank line ends the pending content; for header lines processLine has already flushed it
+                linesWithMetadata.add(new LineType(String.join("\n", currentContent), new HashMap<>(currentMetadata)));
+                currentContent.clear();
+            }
+            // Header changes take effect only now, so content flushed above kept the pre-header metadata
+            currentMetadata = new HashMap<>(initialMetadata);
+        }
+
+        return processOutput(linesWithMetadata, currentContent, currentMetadata);
+    }
+
+    /**
+     * Handle one stripped line. If it is a header we split on, update headerStack/initialMetadata,
+     * flush pending currentContent into linesWithMetadata and return true; otherwise return false.
+     */
+    private boolean processLine(String strippedLine, List<LineType> linesWithMetadata, List<String> currentContent,
+            Map<String, Object> currentMetadata, List<HeaderType> headerStack, Map<String, String> initialMetadata) {
+        for (Pair<String, String> pair : headersToSplitOn) {
+            String sep = pair.getLeft();
+            String name = pair.getRight();
+            if (isHeaderToSplitOn(strippedLine, sep)) {
+                // Ensure we are tracking the header as metadata
+                if (name != null) {
+                    int currentHeaderLevel = StringUtils.countMatches(sep, "#");
+                    // Pop headers at the same or a deeper level than the header just encountered
+                    while (!headerStack.isEmpty()
+                            && headerStack.get(headerStack.size() - 1).getLevel() >= currentHeaderLevel) {
+                        HeaderType poppedHeader = headerStack.remove(headerStack.size() - 1);
+                        // Clear the metadata for the popped header in initialMetadata
+                        initialMetadata.remove(poppedHeader.getName());
+                    }
+                    HeaderType header =
+                            new HeaderType(currentHeaderLevel, name, strippedLine.substring(sep.length()).strip());
+                    headerStack.add(header);
+                    initialMetadata.put(name, header.getData());
+                }
+                // Flush content collected before this header, tagged with the metadata in effect before it
+                if (!currentContent.isEmpty()) {
+                    linesWithMetadata
+                            .add(new LineType(String.join("\n", currentContent), new HashMap<>(currentMetadata)));
+                    currentContent.clear();
+                }
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Check if line starts with a header that we intend to split on.
+     * The separator counts as a header when nothing follows it or when it is followed by a space.
+     */
+    private boolean isHeaderToSplitOn(String strippedLine, String sep) {
+        return strippedLine.startsWith(sep) &&
+                (strippedLine.length() == sep.length() || strippedLine.charAt(sep.length()) == ' ');
+    }
+
+    private List<Document> processOutput(List<LineType> linesWithMetadata, List<String> currentContent,
+            Map<String, Object> currentMetadata) {
+        // Flush content still pending after the last input line
+        if (!currentContent.isEmpty()) {
+            linesWithMetadata.add(new LineType(String.join("\n", currentContent), currentMetadata));
+        }
+        // Merge neighbouring entries that share the same metadata unless returnEachLine was requested
+        if (!returnEachLine) {
+            return aggregateLinesToChunks(linesWithMetadata);
+        } else {
+            return linesWithMetadata.stream()
+                    .map(chunk -> new Document(chunk.getContent(), chunk.getMetadata()))
+                    .toList();
+        }
+    }
+}
diff --git a/langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java b/langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java
new file mode 100644
index 000000000..400ce7eea
--- /dev/null
+++ b/langchain-core/src/test/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitterTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hw.langchain.text.splitter;
+
+import com.hw.langchain.schema.Document;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * @author HamaWhite
+ */
+class MarkdownHeaderTextSplitterTest {
+
+    /**
+     * Case 1: two tracked levels; paragraphs under the same headers are merged into one chunk.
+     */
+    @Test
+    void testMdHeaderTextSplitter1() {
+        String markdownDocument = """
+                # Foo
+
+                    ## Bar
+
+                Hi this is Jim
+
+                Hi this is Joe
+
+                 ## Baz
+
+                 Hi this is Molly
+                """;
+
+        List<Pair<String, String>> headersToSplitOn = List.of(
+                Pair.of("#", "Header 1"),
+                Pair.of("##", "Header 2"));
+
+        MarkdownHeaderTextSplitter markdownSplitter = new MarkdownHeaderTextSplitter(headersToSplitOn);
+        List<Document> output = markdownSplitter.splitText(markdownDocument);
+
+        List<Document> expectedOutput = List.of(
+                new Document(
+                        "Hi this is Jim  \nHi this is Joe",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar")),
+                new Document(
+                        "Hi this is Molly",
+                        Map.of("Header 1", "Foo", "Header 2", "Baz")));
+        assertEquals(expectedOutput, output);
+    }
+
+    /**
+     * Case 2: three tracked levels; a later "##" header drops the "###" entry from the metadata.
+     */
+    @Test
+    void testMdHeaderTextSplitter2() {
+        String markdownDocument = """
+                # Foo
+
+                    ## Bar
+
+                Hi this is Jim
+
+                Hi this is Joe
+
+                 ### Boo
+
+                 Hi this is Lance
+
+                 ## Baz
+
+                 Hi this is Molly
+                """;
+
+        List<Pair<String, String>> headersToSplitOn = List.of(
+                Pair.of("#", "Header 1"),
+                Pair.of("##", "Header 2"),
+                Pair.of("###", "Header 3"));
+
+        MarkdownHeaderTextSplitter markdownSplitter = new MarkdownHeaderTextSplitter(headersToSplitOn);
+        List<Document> output = markdownSplitter.splitText(markdownDocument);
+
+        List<Document> expectedOutput = List.of(
+                new Document(
+                        "Hi this is Jim  \nHi this is Joe",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar")),
+                new Document(
+                        "Hi this is Lance",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar", "Header 3", "Boo")),
+                new Document(
+                        "Hi this is Molly",
+                        Map.of("Header 1", "Foo", "Header 2", "Baz")));
+        assertEquals(expectedOutput, output);
+    }
+
+    /**
+     * Case 3: four tracked levels; a later "##" header drops both the "###" and "####" entries.
+     */
+    @Test
+    void testMdHeaderTextSplitter3() {
+        String markdownDocument = """
+                # Foo
+
+                    ## Bar
+
+                Hi this is Jim
+
+                Hi this is Joe
+
+                 ### Boo
+
+                 Hi this is Lance
+
+                 #### Bim
+
+                 Hi this is John
+
+                 ## Baz
+
+                 Hi this is Molly
+                """;
+
+        List<Pair<String, String>> headersToSplitOn = List.of(
+                Pair.of("#", "Header 1"),
+                Pair.of("##", "Header 2"),
+                Pair.of("###", "Header 3"),
+                Pair.of("####", "Header 4"));
+
+        MarkdownHeaderTextSplitter markdownSplitter = new MarkdownHeaderTextSplitter(headersToSplitOn);
+        List<Document> output = markdownSplitter.splitText(markdownDocument);
+
+        List<Document> expectedOutput = List.of(
+                new Document(
+                        "Hi this is Jim  \nHi this is Joe",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar")),
+                new Document(
+                        "Hi this is Lance",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar", "Header 3", "Boo")),
+                new Document(
+                        "Hi this is John",
+                        Map.of("Header 1", "Foo", "Header 2", "Bar", "Header 3", "Boo", "Header 4", "Bim")),
+                new Document(
+                        "Hi this is Molly",
+                        Map.of("Header 1", "Foo", "Header 2", "Baz")));
+        assertEquals(expectedOutput, output);
+    }
+}
\ No newline at end of file