Skip to content
This repository has been archived by the owner on Jan 2, 2025. It is now read-only.

feat: index and search documentation #978

Merged
merged 42 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
bfa1eb1
feat: index and search documentation
oppiliappan Sep 20, 2023
115842a
integrate docs into studios
oppiliappan Sep 22, 2023
96e3424
rework error handling, add verify endpoint
oppiliappan Sep 25, 2023
63bae3d
introduce two-step search
oppiliappan Sep 26, 2023
6e143c7
interleave scraping and embedding
oppiliappan Sep 29, 2023
a2be4b1
add sse to sync and resync endpoints
oppiliappan Oct 3, 2023
a1a1845
continue crawling even if article cannot be parsed
oppiliappan Oct 4, 2023
e73df9a
introduce `list_with_id` to list pages in a provider
oppiliappan Oct 5, 2023
2df7b0e
return placeholder for page title
oppiliappan Oct 5, 2023
aff57ff
restore search endpoint
oppiliappan Oct 5, 2023
7b83c04
add metadata to all scraped docs
oppiliappan Oct 5, 2023
acf5b95
fix total token counts
oppiliappan Oct 5, 2023
e30e9a3
bug fixes & endpoint improvements
oppiliappan Oct 6, 2023
6738a8a
include index status in response, minor fixes
oppiliappan Oct 9, 2023
b3a6fe5
assortment of fixes
oppiliappan Oct 10, 2023
15c4bd7
extract language name class hierarchy for code
oppiliappan Oct 11, 2023
e8e4586
implement content-addressed scheme in qdrant; bug fixes
oppiliappan Oct 11, 2023
a791073
handle redirects gracefully
oppiliappan Oct 12, 2023
5e55654
introduce tantivy index
oppiliappan Oct 26, 2023
f1a982c
clean up search, fix redir bug
oppiliappan Oct 30, 2023
62bdbad
rip out qdrant
oppiliappan Nov 2, 2023
8603b52
assortment of fixes
oppiliappan Nov 3, 2023
1db820d
assortment of bug fixes, unabridged edition
oppiliappan Nov 6, 2023
bdf5ec7
remove chunking logic
oppiliappan Nov 6, 2023
4ea529e
rework log levels to work well with sentry
oppiliappan Nov 6, 2023
ac26035
lower log level to trace
oppiliappan Nov 6, 2023
81d01f5
clippy
oppiliappan Nov 6, 2023
e46bb4e
fix bugs arising from indexing momentjs
oppiliappan Nov 6, 2023
d097eae
add analytics events to /sync
oppiliappan Nov 6, 2023
036b9a1
fix studios
oppiliappan Nov 6, 2023
7ec7f9d
clippy
oppiliappan Nov 6, 2023
a3c9ab4
bug bug bug
oppiliappan Nov 6, 2023
65a2cfc
more url-ness
oppiliappan Nov 6, 2023
7005995
add absolute_url to doc-context-file
oppiliappan Nov 6, 2023
7fe7b3b
address review comments
oppiliappan Nov 7, 2023
45eb337
undo accidental doc test
oppiliappan Nov 7, 2023
f671dbe
Doc indexing FE (#1118)
anastasiya1155 Nov 7, 2023
e79c09c
fix selecting section after search in doc modal
anastasiya1155 Nov 7, 2023
749b1db
address review comments
oppiliappan Nov 7, 2023
ffd5db7
attempt to fix duplication
oppiliappan Nov 7, 2023
f80d820
fix issue with python tutorial page
oppiliappan Nov 7, 2023
727e0b0
address clippy
oppiliappan Nov 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
integrate docs into studios
studio crud ops
---------------

- introduces a new `doc_context` to studio snapshots and studios
- `doc_context` may be populated in a similar fashion to `context`,
  through the `patch` method, sample request:

    http PATCH :7878/api/studio/1
    {
        "doc_context": [
            {
                "doc_id": 3,
                "doc_source": "https://docs.rs/qdrant-client/latest/qdrant_client/",
                "relative_url": "qdrant/struct.PayloadIncludeSelector.html",
                "ranges": [
                    "eaefb40a-13a3-4c2e-a0b3-4ffa3670bfa4",
                    "9b0f63bb-9ecd-46ee-8315-23065df418ce"
                ],
                "hidden": false
            }
        ]
    }
- the uuids in the `ranges` field correspond to the sections in a
  webpage
- to display a webpage with its active and inactive sections, use the
  `fetch` endpoint, which lists every section of the page in-order.
  among these, the active sections are those which are present in the
  studio context

token counting
--------------

- token counts for docs are calculated separately and added under the
  `doc_context` field
- token counts include the headers for each section as well
  • Loading branch information
oppiliappan committed Nov 7, 2023
commit 115842af083e7219cbb184be79481d1f9a9f76e7
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ CREATE TABLE docs (

modified_at DATETIME NOT NULL DEFAULT (datetime('now'))
);
-- Add the per-snapshot documentation context column.
-- SQLite refuses to add a NOT NULL column without a non-NULL default when the
-- table already contains rows, so a default is required for this migration to
-- succeed on databases with existing studio snapshots.
-- NOTE(review): '[]' assumes doc_context is stored as a serialized JSON array
-- (an empty list meaning "no docs in context") -- confirm against the code
-- that reads and writes this column.
ALTER TABLE studio_snapshots ADD COLUMN doc_context TEXT NOT NULL DEFAULT '[]';
200 changes: 114 additions & 86 deletions server/bleep/sqlx-data.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,36 @@
},
"query": "SELECT user_id FROM templates WHERE id = ? AND (user_id = ? OR user_id IS NULL)"
},
"0814a29c70503ad8abb4894621394e2ce45f1244772ce30345279dbc104ea01f": {
"describe": {
"columns": [
{
"name": "messages",
"ordinal": 0,
"type_info": "Text"
},
{
"name": "context",
"ordinal": 1,
"type_info": "Text"
},
{
"name": "doc_context",
"ordinal": 2,
"type_info": "Text"
}
],
"nullable": [
false,
false,
false
],
"parameters": {
"Right": 1
}
},
"query": "SELECT messages, context, doc_context FROM studio_snapshots WHERE id = ?"
},
"081aa337d2c658e09c21c3f7ec2f5164296c66e78fad34d2e65b47a0de508aeb": {
"describe": {
"columns": [
Expand All @@ -78,15 +108,47 @@
},
"query": "SELECT context, messages FROM studio_snapshots WHERE id = ?"
},
"0c72c51f4a5b726f6f524fcb269f074550b1a3a25ed050fb2701f8bec02678d7": {
"0c06bc7f11f6782618297e540890725a1977b1ec6a80849cd28b7f07c1fd5bd4": {
"describe": {
"columns": [],
"nullable": [],
"columns": [
{
"name": "id!",
"ordinal": 0,
"type_info": "Int64"
},
{
"name": "modified_at",
"ordinal": 1,
"type_info": "Datetime"
},
{
"name": "context",
"ordinal": 2,
"type_info": "Text"
},
{
"name": "doc_context",
"ordinal": 3,
"type_info": "Text"
},
{
"name": "messages",
"ordinal": 4,
"type_info": "Text"
}
],
"nullable": [
true,
false,
false,
false,
false
],
"parameters": {
"Right": 3
"Right": 2
}
},
"query": "INSERT INTO studio_snapshots (studio_id, context, messages)\n VALUES (?, ?, ?)"
"query": "SELECT ss.id as 'id!', ss.modified_at, ss.context, ss.doc_context, ss.messages\n FROM studio_snapshots ss\n JOIN studios s ON s.id = ss.studio_id AND s.user_id = ?\n WHERE ss.studio_id = ?\n ORDER BY modified_at DESC"
},
"11f5e7122d047f87c398cf56470c284e2037203bc4d1506efc85e7431e2e2f5f": {
"describe": {
Expand Down Expand Up @@ -260,6 +322,16 @@
},
"query": "UPDATE docs SET modified_at = datetime('now') WHERE id = ?"
},
"4573aa5ae3c4778b61a41e8984bb07b49d93e431c3f8434b5302df1a7a81997c": {
"describe": {
"columns": [],
"nullable": [],
"parameters": {
"Right": 2
}
},
"query": "UPDATE studio_snapshots SET doc_context = ? WHERE id = ?"
},
"476c0b82963b9a2333edec797133770f32c8269a21a17d3165e7785f69e886ab": {
"describe": {
"columns": [
Expand Down Expand Up @@ -428,30 +500,6 @@
},
"query": "DELETE FROM studio_snapshots\n WHERE id IN (\n SELECT ss.id\n FROM studio_snapshots ss\n JOIN studios s ON s.id = ss.studio_id AND s.user_id = ?\n WHERE ss.id = ? AND ss.studio_id = ?\n )\n RETURNING id"
},
"671df14b7c9077b95e586690f8c6d3f2eeb0a3942d0b800f272b010fcd2ca97b": {
"describe": {
"columns": [
{
"name": "messages",
"ordinal": 0,
"type_info": "Text"
},
{
"name": "context",
"ordinal": 1,
"type_info": "Text"
}
],
"nullable": [
false,
false
],
"parameters": {
"Right": 1
}
},
"query": "SELECT messages, context FROM studio_snapshots WHERE id = ?"
},
"69c8b59ce4be3fc6edb58563bf69f55ea5dca4646b0ba05820e5d1b2b07c3c82": {
"describe": {
"columns": [
Expand Down Expand Up @@ -608,71 +656,93 @@
},
"query": "SELECT id FROM studios WHERE id = ? AND name IS NULL"
},
"7ca39c2d8aebd7ebe1dbda00b4fb5fce6f4ea17894ce64c3d9786585c61739f0": {
"881aa78dfa3cd1bc3aa7a6edb8281aec5a972c1f53607d25c4e1f6d03cd3faef": {
"describe": {
"columns": [],
"nullable": [],
"parameters": {
"Right": 3
}
},
"query": "INSERT INTO tutorial_questions (question, tag, repo_ref) VALUES (?, ?, ?)"
},
"8c70038e00fa4619a2d77cbf2de3084bafa99e19567cd3bb5cde55f56b5c0070": {
"describe": {
"columns": [
{
"name": "id!",
"name": "id",
"ordinal": 0,
"type_info": "Int64"
},
{
"name": "modified_at",
"name": "name",
"ordinal": 1,
"type_info": "Datetime"
"type_info": "Text"
},
{
"name": "context",
"ordinal": 2,
"type_info": "Text"
},
{
"name": "messages",
"name": "doc_context",
"ordinal": 3,
"type_info": "Text"
},
{
"name": "messages",
"ordinal": 4,
"type_info": "Text"
},
{
"name": "modified_at",
"ordinal": 5,
"type_info": "Datetime"
}
],
"nullable": [
false,
true,
false,
false,
false,
false
],
"parameters": {
"Right": 2
"Right": 3
}
},
"query": "SELECT ss.id as 'id!', ss.modified_at, ss.context, ss.messages\n FROM studio_snapshots ss\n JOIN studios s ON s.id = ss.studio_id AND s.user_id = ?\n WHERE ss.studio_id = ?\n ORDER BY modified_at DESC"
"query": "SELECT s.id, s.name, ss.context, ss.doc_context, ss.messages, ss.modified_at\n FROM studios s\n INNER JOIN studio_snapshots ss ON ss.id = ?\n WHERE s.id = ? AND s.user_id = ?"
},
"881aa78dfa3cd1bc3aa7a6edb8281aec5a972c1f53607d25c4e1f6d03cd3faef": {
"8f99eede8e6c1fb27acc2524c00cebbc2d4e73db8af05599521e3b00c621347f": {
"describe": {
"columns": [],
"nullable": [],
"parameters": {
"Right": 3
"Right": 2
}
},
"query": "INSERT INTO tutorial_questions (question, tag, repo_ref) VALUES (?, ?, ?)"
"query": "UPDATE studio_snapshots SET modified_at = ? WHERE id = ?"
},
"8f99eede8e6c1fb27acc2524c00cebbc2d4e73db8af05599521e3b00c621347f": {
"9146d9c8a7f17cc65c017cb364d1a853a9163b5ece336c0a6ef4e28e8df56a6b": {
"describe": {
"columns": [],
"nullable": [],
"parameters": {
"Right": 2
}
},
"query": "UPDATE studio_snapshots SET modified_at = ? WHERE id = ?"
"query": "UPDATE chunk_cache SET branches = ? WHERE chunk_hash = ?"
},
"9146d9c8a7f17cc65c017cb364d1a853a9163b5ece336c0a6ef4e28e8df56a6b": {
"940f2221bcffd98ced716442c4360353a6e2366c134c8d72283620db288e701c": {
"describe": {
"columns": [],
"nullable": [],
"parameters": {
"Right": 2
"Right": 4
}
},
"query": "UPDATE chunk_cache SET branches = ? WHERE chunk_hash = ?"
"query": "INSERT INTO studio_snapshots (studio_id, context, doc_context, messages)\n VALUES (?, ?, ?, ?)"
},
"9f862a56e79cc9ae6e9b896064a0057335b40225be0a8c8d29d9227de12ae364": {
"describe": {
Expand Down Expand Up @@ -712,48 +782,6 @@
},
"query": "DELETE FROM studios WHERE id = ? AND user_id = ? RETURNING id"
},
"a921e0e531e66f7d679434a6eac1f00f30b85105adc16e0ec04bb95fe54190a9": {
"describe": {
"columns": [
{
"name": "id",
"ordinal": 0,
"type_info": "Int64"
},
{
"name": "name",
"ordinal": 1,
"type_info": "Text"
},
{
"name": "context",
"ordinal": 2,
"type_info": "Text"
},
{
"name": "messages",
"ordinal": 3,
"type_info": "Text"
},
{
"name": "modified_at",
"ordinal": 4,
"type_info": "Datetime"
}
],
"nullable": [
false,
true,
false,
false,
false
],
"parameters": {
"Right": 3
}
},
"query": "SELECT s.id, s.name, ss.context, ss.messages, ss.modified_at\n FROM studios s\n INNER JOIN studio_snapshots ss ON ss.id = ?\n WHERE s.id = ? AND s.user_id = ?"
},
"abf57821a0ac6f855a9dc677de87beac319610add247dbff2f4ce9a2eec3ce2a": {
"describe": {
"columns": [
Expand Down
6 changes: 5 additions & 1 deletion server/bleep/src/indexes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,11 @@ pub struct Indexes {
}

impl Indexes {
pub fn new(config: &Configuration) -> Result<Self> {
pub async fn new(
config: &Configuration,
sql: crate::SqlDb,
semantic: crate::semantic::Semantic,
) -> Result<Self> {
Ok(Self {
repo: Indexer::create(
Repo::new(),
Expand Down
Loading