Rename examples; add fields to content only when they are present

langroid · pchalasani · Jan 5, 2024 · Jan 5, 2024 · Jan 5, 2024 · Jan 5, 2024
commit 0bf641d48ad4cd34fef95c2a37e194688779ba5f
diff --git a/examples/docqa/lance-filtered-gh-issues.py → examples/docqa/lance-rag-gh-issues.py b/examples/docqa/lance-filtered-gh-issues.py → examples/docqa/lance-rag-gh-issues.py
@@ -60,6 +60,7 @@ def main(
 
     cfg = DocChatAgentConfig(
         vecdb=ldb_cfg,
+        add_fields_to_content=["state", "year", "month", "assignee", "size"],
     )
     agent = LanceDocChatAgent(cfg)
     repo = Prompt.ask(
@@ -73,7 +74,6 @@ def main(
     issues = repo_loader.get_issues(k=int(n_issues))
     issue_dicts = [iss.dict() for iss in issues]
     df = pd.DataFrame(issue_dicts)
-
     metadata_cols = []
     agent.ingest_dataframe(df, content="text", metadata=metadata_cols)
 

diff --git a/examples/docqa/lance-filtered-movies.py → examples/docqa/lance-rag-movies.py b/examples/docqa/lance-filtered-movies.py → examples/docqa/lance-rag-movies.py
@@ -96,6 +96,7 @@ def main(
     )
     cfg = DocChatAgentConfig(
         vecdb=ldb_cfg,
+        add_fields_to_content=["movie", "genre", "certificate", "stars", "rating"],
     )
     agent = LanceDocChatAgent(cfg)
 

diff --git a/langroid/agent/special/doc_chat_agent.py b/langroid/agent/special/doc_chat_agent.py
@@ -261,13 +261,17 @@ def ingest_docs(self, docs: List[Document], split: bool = True) -> int:
         # Note we need to do this at this stage so that the embeddings
         # are computed on the full content with these additional fields.
         if len(self.config.add_fields_to_content) > 0:
-            for d in docs:
-                key_vals = extract_fields(d, self.config.add_fields_to_content)
-                d.content = (
-                    ",".join(f"{k}={v}" for k, v in key_vals.items())
-                    + ",content="
-                    + d.content
-                )
+            fields = [
+                f for f in extract_fields(docs[0], self.config.add_fields_to_content)
+            ]
+            if len(fields) > 0:
+                for d in docs:
+                    key_vals = extract_fields(d, fields)
+                    d.content = (
+                        ",".join(f"{k}={v}" for k, v in key_vals.items())
+                        + ",content="
+                        + d.content
+                    )
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:

diff --git a/langroid/agent/special/lance_doc_chat_agent.py b/langroid/agent/special/lance_doc_chat_agent.py
@@ -44,7 +44,14 @@ def _get_clean_vecdb_schema(self) -> str:
             self.vecdb.schema,
             excludes=["id", "vector"],
         )
-        return json.dumps(schema_dict, indent=4)
+        schema = json.dumps(schema_dict, indent=4)
+        if len(fields := self.config.add_fields_to_content) > 0:
+            schema += f"""
+            Additional fields added to `content` as key=value pairs:
+            NOTE That CAN Help with matching queries!
+            {fields}
+            """
+        return schema
 
     def query_plan(self, msg: QueryPlanTool) -> str:
         """
@@ -105,7 +112,8 @@ def ingest_dataframe(
         # This helps retrieval for table-like data.
         # Note we need to do this at this stage so that the embeddings
         # are computed on the full content with these additional fields.
-        if len(fields := self.config.add_fields_to_content) > 0:
+        fields = [f for f in self.config.add_fields_to_content if f in df.columns]
+        if len(fields) > 0:
             df[content] = df.apply(
                 lambda row: (",".join(f"{f}={row[f]}" for f in fields))
                 + ", content="