Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

doc chat content fields #343

Merged
merged 3 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
rename examples; content fields check presence
  • Loading branch information
pchalasani committed Jan 5, 2024
commit 0bf641d48ad4cd34fef95c2a37e194688779ba5f
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def main(

cfg = DocChatAgentConfig(
vecdb=ldb_cfg,
add_fields_to_content=["state", "year", "month", "assignee", "size"],
)
agent = LanceDocChatAgent(cfg)
repo = Prompt.ask(
Expand All @@ -73,7 +74,6 @@ def main(
issues = repo_loader.get_issues(k=int(n_issues))
issue_dicts = [iss.dict() for iss in issues]
df = pd.DataFrame(issue_dicts)

metadata_cols = []
agent.ingest_dataframe(df, content="text", metadata=metadata_cols)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def main(
)
cfg = DocChatAgentConfig(
vecdb=ldb_cfg,
add_fields_to_content=["movie", "genre", "certificate", "stars", "rating"],
)
agent = LanceDocChatAgent(cfg)

Expand Down
18 changes: 11 additions & 7 deletions langroid/agent/special/doc_chat_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,13 +261,17 @@ def ingest_docs(self, docs: List[Document], split: bool = True) -> int:
# Note we need to do this at this stage so that the embeddings
# are computed on the full content with these additional fields.
if len(self.config.add_fields_to_content) > 0:
for d in docs:
key_vals = extract_fields(d, self.config.add_fields_to_content)
d.content = (
",".join(f"{k}={v}" for k, v in key_vals.items())
+ ",content="
+ d.content
)
fields = [
f for f in extract_fields(docs[0], self.config.add_fields_to_content)
]
if len(fields) > 0:
for d in docs:
key_vals = extract_fields(d, fields)
d.content = (
",".join(f"{k}={v}" for k, v in key_vals.items())
+ ",content="
+ d.content
)
# add embeddings in batches, to stay under limit of embeddings API
batches = list(batched(docs, self.config.embed_batch_size))
for batch in batches:
Expand Down
12 changes: 10 additions & 2 deletions langroid/agent/special/lance_doc_chat_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,14 @@ def _get_clean_vecdb_schema(self) -> str:
self.vecdb.schema,
excludes=["id", "vector"],
)
return json.dumps(schema_dict, indent=4)
schema = json.dumps(schema_dict, indent=4)
if len(fields := self.config.add_fields_to_content) > 0:
schema += f"""
Additional fields added to `content` as key=value pairs:
NOTE That CAN Help with matching queries!
{fields}
"""
return schema

def query_plan(self, msg: QueryPlanTool) -> str:
"""
Expand Down Expand Up @@ -105,7 +112,8 @@ def ingest_dataframe(
# This helps retrieval for table-like data.
# Note we need to do this at this stage so that the embeddings
# are computed on the full content with these additional fields.
if len(fields := self.config.add_fields_to_content) > 0:
fields = [f for f in self.config.add_fields_to_content if f in df.columns]
if len(fields) > 0:
df[content] = df.apply(
lambda row: (",".join(f"{f}={row[f]}" for f in fields))
+ ", content="
Expand Down
Loading