Skip to content

Commit

Permalink
add Pinecone (30%)
Browse files Browse the repository at this point in the history
  • Loading branch information
HamaWhiteGG committed Jun 26, 2023
1 parent 30e833a commit da761ae
Show file tree
Hide file tree
Showing 20 changed files with 1,679 additions and 2 deletions.
723 changes: 723 additions & 0 deletions docs/extras/modules/state_of_the_union.txt

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions langchain-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,21 @@
<artifactId>jython-standalone</artifactId>
</dependency>

<dependency>
<groupId>com.knuddels</groupId>
<artifactId>jtokkit</artifactId>
</dependency>

<dependency>
<groupId>com.googlecode.juniversalchardet</groupId>
<artifactId>juniversalchardet</artifactId>
</dependency>

<dependency>
<groupId>io.pinecone</groupId>
<artifactId>pinecone-client</artifactId>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public class ChatOpenAI extends BaseChatModel {
protected Integer maxTokens;

/**
* Validate that api key exists in environment.
* Validate parameters and init client
*/
public ChatOpenAI init() {
openaiApiKey = getOrEnvOrDefault(openaiApiKey, "OPENAI_API_KEY");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hw.langchain.document.loaders.base;

import com.hw.langchain.schema.Document;

import java.util.List;

/**
* Interface for loading documents.
* <p>
* The only method declared here is {@link #load()}, which returns all documents
* as a single list held in memory.
* <p>
* NOTE(review): earlier wording asked implementations to provide a lazy-loading
* method using generators; no such method is declared in this interface, so that
* guidance cannot be followed as written. Confirm whether a lazy variant is planned.
*
* @author HamaWhite
*/
public interface BaseLoader {

/**
* Load data into document objects.
*
* @return a List which is materialized in memory.
*/
List<Document> load();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hw.langchain.document.loaders.helpers;

import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.Getter;

import java.nio.charset.Charset;

/**
 * Immutable holder for the outcome of a file-encoding detection: the charset that
 * was chosen plus the detector's confidence and language guess.
 * <p>
 * Getters, the all-arguments constructor and {@code equals}/{@code hashCode} are
 * generated by Lombok. The fields are {@code final} because the class exposes no
 * setters; an instance is fully defined by its constructor arguments.
 *
 * @author HamaWhite
 */
@Getter
@AllArgsConstructor
@EqualsAndHashCode
public class FileEncoding {

    /** The charset selected for the file. */
    private final Charset encoding;

    /**
     * Confidence value reported by the detector for {@link #encoding}.
     * NOTE(review): presumably on the detector's 0-100 scale; confirm against the caller.
     */
    private final int confidence;

    /**
     * Language reported by the detector.
     * NOTE(review): may be {@code null} if the detector could not determine one; verify.
     */
    private final String language;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hw.langchain.document.loaders.helpers;

import org.python.icu.text.CharsetDetector;
import org.python.icu.text.CharsetMatch;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Static helper functions shared by the document loaders.
 *
 * @author HamaWhite
 */
public class Helpers {

    /**
     * Try to detect the file encoding.
     * <p>
     * Reads the whole file into memory, passes the bytes to a {@link CharsetDetector}
     * and converts the best match into a {@link FileEncoding}.
     *
     * @param filePath path of the file whose encoding should be detected
     * @return the detected charset together with the detector's confidence and language
     * @throws IOException if the file cannot be read, if the detector finds no matching
     *                     charset, or if the detected charset name is not supported by this JVM
     */
    public static FileEncoding detectFileEncodings(String filePath) throws IOException {
        Path path = Paths.get(filePath);
        byte[] data = Files.readAllBytes(path);

        CharsetDetector detector = new CharsetDetector();
        detector.setText(data);
        CharsetMatch match = detector.detect();
        // detect() yields null when no charset matches the input; fail with a clear
        // checked error instead of a NullPointerException on match.getName().
        if (match == null) {
            throw new IOException("Could not detect the encoding of " + filePath);
        }

        Charset charset;
        try {
            charset = Charset.forName(match.getName());
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names: the detector may report
            // a name that has no java.nio.charset equivalent on this JVM.
            throw new IOException(
                    "Detected encoding " + match.getName() + " of " + filePath + " is not supported", e);
        }
        return new FileEncoding(charset, match.getConfidence(), match.getLanguage());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hw.langchain.document.loaders.text;

import com.hw.langchain.document.loaders.base.BaseLoader;
import com.hw.langchain.document.loaders.helpers.FileEncoding;
import com.hw.langchain.exception.LangChainException;
import com.hw.langchain.schema.Document;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;

import static com.hw.langchain.document.loaders.helpers.Helpers.detectFileEncodings;

/**
 * Load text files.
 * <p>
 * Reads the whole file into a single {@link Document} whose metadata contains the
 * file path under the key {@code "source"}.
 *
 * @author HamaWhite
 */
public class TextLoader implements BaseLoader {

    private static final Logger LOG = LoggerFactory.getLogger(TextLoader.class);

    private final String filePath;

    private final Charset encoding;

    private final boolean autodetectEncoding;

    /**
     * Create a loader that reads the file with the default system encoding and
     * without encoding auto-detection.
     *
     * @param filePath Path to the file to load.
     */
    public TextLoader(String filePath) {
        this(filePath, Charset.defaultCharset(), false);
    }

    /**
     * Load text files.
     *
     * @param filePath           Path to the file to load.
     * @param encoding           File encoding to use. If {@code null}, the file will be loaded with the default
     *                           system encoding.
     * @param autodetectEncoding Whether to try to autodetect the file encoding if the specified encoding fails.
     */
    public TextLoader(String filePath, Charset encoding, boolean autodetectEncoding) {
        this.filePath = filePath;
        // Files.readString rejects a null charset, so the documented "null means
        // system default" contract has to be applied here.
        this.encoding = encoding != null ? encoding : Charset.defaultCharset();
        this.autodetectEncoding = autodetectEncoding;
    }

    /**
     * Load from file path.
     *
     * @return a single-element list holding the file content as one document
     * @throws LangChainException if the file cannot be read or decoded
     */
    @Override
    public List<Document> load() {
        String text;
        try {
            text = Files.readString(Path.of(filePath), encoding);
        } catch (IOException e) {
            // Files.readString reports decoding failures as IOException too, so the
            // detection fallback is attempted for any IOException when enabled.
            if (autodetectEncoding) {
                text = loadWithDetectedEncoding(filePath);
            } else {
                throw new LangChainException(errorMessage(filePath), e);
            }
        } catch (Exception e) {
            // Unchecked failures (e.g. an invalid path string) are reported the same way.
            throw new LangChainException(errorMessage(filePath), e);
        }
        Map<String, Object> metadata = Map.of("source", filePath);
        return List.of(new Document(text, metadata));
    }

    /**
     * Detect the file's encoding and read the file again using the detected charset.
     *
     * @param filePath path of the file to read
     * @return the decoded file content
     * @throws LangChainException if detection or the second read fails with an I/O error
     */
    private String loadWithDetectedEncoding(String filePath) {
        try {
            FileEncoding detected = detectFileEncodings(filePath);
            LOG.debug("Trying encoding: {}", detected.getEncoding());
            return Files.readString(Path.of(filePath), detected.getEncoding());
        } catch (IOException e) {
            throw new LangChainException(errorMessage(filePath), e);
        }
    }

    /** Build the message used for every load failure of the given file. */
    private String errorMessage(String filePath) {
        return "Error loading " + filePath;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hw.langchain.embeddings.base;

import java.util.List;

/**
* Interface for embedding models.
* <p>
* Declares one method for embedding a list of document texts and one for
* embedding a single query text. Embeddings are represented as lists of floats.
*
* @author HamaWhite
*/
public interface Embeddings {

/**
* Embed search docs.
*
* @param texts the document texts to embed
* @return a list of embedding vectors; presumably one vector per input text,
*         in input order — TODO confirm against implementations
*/
List<List<Float>> embedDocuments(List<String> texts);

/**
* Embed query text.
*
* @param text the query text to embed
* @return the embedding vector for the query
*/
List<Float> embedQuery(String text);
}
Loading

0 comments on commit da761ae

Please sign in to comment.