forked from HamaWhiteGG/langchain-java
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3b6587b
commit 0d048e9
Showing
4 changed files
with
443 additions
and
0 deletions.
There are no files selected for viewing
47 changes: 47 additions & 0 deletions
47
langchain-core/src/main/java/com/hw/langchain/text/splitter/HeaderType.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.hw.langchain.text.splitter; | ||
|
||
import lombok.Data; | ||
|
||
/** | ||
* @author HamaWhite | ||
*/ | ||
@Data | ||
public class HeaderType { | ||
|
||
private int level; | ||
|
||
private String name; | ||
|
||
private String data; | ||
|
||
/** | ||
* Create a new HeaderType. | ||
* | ||
* @param level Header level | ||
* @param name Header name | ||
* @param data Header data | ||
*/ | ||
public HeaderType(int level, String name, String data) { | ||
this.level = level; | ||
this.name = name; | ||
this.data = data; | ||
} | ||
} |
47 changes: 47 additions & 0 deletions
47
langchain-core/src/main/java/com/hw/langchain/text/splitter/LineType.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.hw.langchain.text.splitter; | ||
|
||
import lombok.Data; | ||
|
||
import java.util.Map; | ||
|
||
/** | ||
* Line type as typed dict. | ||
* | ||
* @author HamaWhite | ||
*/ | ||
@Data | ||
public class LineType { | ||
|
||
private String content; | ||
|
||
private Map<String, Object> metadata; | ||
|
||
/** | ||
* Create a new LineType. | ||
* | ||
* @param content Line content | ||
* @param metadata Line metadata | ||
*/ | ||
public LineType(String content, Map<String, Object> metadata) { | ||
this.content = content; | ||
this.metadata = metadata; | ||
} | ||
} |
182 changes: 182 additions & 0 deletions
182
langchain-core/src/main/java/com/hw/langchain/text/splitter/MarkdownHeaderTextSplitter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.hw.langchain.text.splitter; | ||
|
||
import com.google.common.collect.Maps; | ||
import com.hw.langchain.schema.Document; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.commons.lang3.tuple.Pair; | ||
|
||
import java.util.*; | ||
|
||
/** | ||
* Implementation of splitting markdown files based on specified headers. | ||
* | ||
* @author HamaWhite | ||
*/ | ||
public class MarkdownHeaderTextSplitter { | ||
|
||
/** | ||
* Headers we want to track | ||
*/ | ||
private final List<Pair<String, String>> headersToSplitOn; | ||
|
||
/** | ||
* Return each line w/ associated headers | ||
*/ | ||
private final boolean returnEachLine; | ||
|
||
public MarkdownHeaderTextSplitter(List<Pair<String, String>> headersToSplitOn) { | ||
this(headersToSplitOn, false); | ||
} | ||
|
||
public MarkdownHeaderTextSplitter(List<Pair<String, String>> headersToSplitOn, boolean returnEachLine) { | ||
// Output line-by-line or aggregated into chunks w/ common headers | ||
this.returnEachLine = returnEachLine; | ||
|
||
// Given the headers we want to split on, (e.g., "#, ##, etc") order by length | ||
this.headersToSplitOn = headersToSplitOn.stream() | ||
.sorted(Comparator.<Pair<String, String>>comparingInt(e -> e.getKey().length()).reversed()) | ||
.toList(); | ||
} | ||
|
||
/** | ||
* Combine lines with common metadata into chunks. | ||
* | ||
* @param lines Line of text / associated header metadata | ||
* @return List of Document chunks | ||
*/ | ||
public List<Document> aggregateLinesToChunks(List<LineType> lines) { | ||
List<LineType> aggregatedChunks = new ArrayList<>(); | ||
|
||
for (LineType line : lines) { | ||
if (!aggregatedChunks.isEmpty() | ||
&& (aggregatedChunks.get(aggregatedChunks.size() - 1).getMetadata().equals(line.getMetadata()))) { | ||
// If the last line in the aggregated list has the same metadata as the current line, | ||
// append the current content to the last line's content | ||
LineType lastChunk = aggregatedChunks.get(aggregatedChunks.size() - 1); | ||
lastChunk.setContent(lastChunk.getContent() + " \n" + line.getContent()); | ||
} else { | ||
// Otherwise, append the current line to the aggregated list | ||
aggregatedChunks.add(line); | ||
} | ||
} | ||
return aggregatedChunks.stream() | ||
.map(chunk -> new Document(chunk.getContent(), chunk.getMetadata())) | ||
.toList(); | ||
} | ||
|
||
/** | ||
* Split markdown file. | ||
* | ||
* @param text Markdown file | ||
* @return List of Document chunks | ||
*/ | ||
public List<Document> splitText(String text) { | ||
List<LineType> linesWithMetadata = new ArrayList<>(); | ||
// Content and metadata of the chunk currently being processed | ||
List<String> currentContent = new ArrayList<>(); | ||
Map<String, Object> currentMetadata = Maps.newHashMap(); | ||
// Keep track of the nested header structure | ||
List<HeaderType> headerStack = new ArrayList<>(); | ||
Map<String, String> initialMetadata = Maps.newHashMap(); | ||
|
||
// Split the input text by newline character ("\n"). | ||
String[] lines = text.split("\n"); | ||
for (String line : lines) { | ||
String strippedLine = line.strip(); | ||
// Check each line against each of the header types (e.g., #, ##) | ||
boolean foundHeader = processLine(strippedLine, linesWithMetadata, currentContent, currentMetadata, | ||
headerStack, initialMetadata); | ||
|
||
if (!foundHeader && !strippedLine.isEmpty()) { | ||
currentContent.add(strippedLine); | ||
} else if (!currentContent.isEmpty()) { | ||
linesWithMetadata.add(new LineType(String.join("\n", currentContent), new HashMap<>(currentMetadata))); | ||
currentContent.clear(); | ||
} | ||
currentMetadata = new HashMap<>(initialMetadata); | ||
} | ||
|
||
return processOutput(linesWithMetadata, currentContent, currentMetadata); | ||
} | ||
|
||
private boolean processLine(String strippedLine, List<LineType> linesWithMetadata, List<String> currentContent, | ||
Map<String, Object> currentMetadata, List<HeaderType> headerStack, Map<String, String> initialMetadata) { | ||
for (Pair<String, String> pair : headersToSplitOn) { | ||
String sep = pair.getLeft(); | ||
String name = pair.getValue(); | ||
if (isHeaderToSplitOn(strippedLine, sep)) { | ||
// Ensure we are tracking the header as metadata | ||
if (name != null) { | ||
// Get the current header level | ||
int currentHeaderLevel = StringUtils.countMatches(sep, "#"); | ||
// Pop out headers of lower or same level from the stack | ||
while (!headerStack.isEmpty() | ||
&& headerStack.get(headerStack.size() - 1).getLevel() >= currentHeaderLevel) { | ||
// We have encountered a new header at the same or higher level | ||
HeaderType poppedHeader = headerStack.remove(headerStack.size() - 1); | ||
// Clear the metadata for the popped header in initialMetadata | ||
initialMetadata.remove(poppedHeader.getName()); | ||
} | ||
// Push the current header to the stack | ||
HeaderType header = | ||
new HeaderType(currentHeaderLevel, name, strippedLine.substring(sep.length()).strip()); | ||
headerStack.add(header); | ||
// Update initialMetadata with the current header | ||
initialMetadata.put(name, header.getData()); | ||
} | ||
// Add the previous line to the linesWithMetadata only if currentContent is not empty | ||
if (!currentContent.isEmpty()) { | ||
linesWithMetadata | ||
.add(new LineType(String.join("\n", currentContent), new HashMap<>(currentMetadata))); | ||
currentContent.clear(); | ||
} | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
/** | ||
* Check if line starts with a header that we intend to split on. | ||
* Header with no text OR header is followed by space Both are valid conditions that sep is being used a header. | ||
*/ | ||
private boolean isHeaderToSplitOn(String strippedLine, String sep) { | ||
return strippedLine.startsWith(sep) && | ||
(strippedLine.length() == sep.length() || strippedLine.charAt(sep.length()) == ' '); | ||
} | ||
|
||
private List<Document> processOutput(List<LineType> linesWithMetadata, List<String> currentContent, | ||
Map<String, Object> currentMetadata) { | ||
if (!currentContent.isEmpty()) { | ||
linesWithMetadata.add(new LineType(String.join("\n", currentContent), currentMetadata)); | ||
} | ||
// linesWithMetadata has each line with associated header metadata aggregate these into chunks based on common | ||
// metadata | ||
if (!returnEachLine) { | ||
return aggregateLinesToChunks(linesWithMetadata); | ||
} else { | ||
return linesWithMetadata.stream() | ||
.map(chunk -> new Document(chunk.getContent(), chunk.getMetadata())) | ||
.toList(); | ||
} | ||
} | ||
} |
Oops, something went wrong.