Skip to content

Commit

Permalink
Add TODOs and minor style changes to web connector (onyx-dot-app#254)
Browse files (browse the repository at this point in the history)
  • Loading branch information
yuhongsun96 authored Jul 29, 2023
1 parent 0d7d54f commit 17e2008
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 22 deletions.
17 changes: 10 additions & 7 deletions backend/danswer/configs/app_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,16 @@
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
)
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES",
"sidebar,header,footer").split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get("WEB_CONNECTOR_IGNORED_ELEMENTS",
"nav,header,footer,meta,script,style,symbol,aside").split(",")
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
# TODO: these should be configurable from the frontend, via an expandable "advanced options" section
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
).split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
).split(",")
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")

#####
# Query Configs
Expand Down
38 changes: 23 additions & 15 deletions backend/danswer/connectors/web/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,24 @@
import bs4
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from PyPDF2 import PdfReader

from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session

from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
from oauthlib.oauth2 import BackendApplicationClient
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from PyPDF2 import PdfReader
from requests_oauthlib import OAuth2Session # type:ignore

logger = setup_logger()

Expand Down Expand Up @@ -123,13 +123,21 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:

context = browser.new_context()

if WEB_CONNECTOR_OAUTH_CLIENT_ID and WEB_CONNECTOR_OAUTH_CLIENT_SECRET and WEB_CONNECTOR_OAUTH_TOKEN_URL:
if (
WEB_CONNECTOR_OAUTH_CLIENT_ID
and WEB_CONNECTOR_OAUTH_CLIENT_SECRET
and WEB_CONNECTOR_OAUTH_TOKEN_URL
):
client = BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
oauth = OAuth2Session(client=client)
token = oauth.fetch_token(token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET)
context.set_extra_http_headers({"Authorization": "Bearer {}".format(token["access_token"])})
token = oauth.fetch_token(
token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
)
context.set_extra_http_headers(
{"Authorization": "Bearer {}".format(token["access_token"])}
)

return playwright, context

Expand Down
2 changes: 2 additions & 0 deletions backend/requirements/default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Mako==1.2.4
nltk==3.8.1
docx2txt==0.8
openai==0.27.6
oauthlib==3.2.2
playwright==1.32.1
psycopg2==2.9.6
psycopg2-binary==2.9.6
Expand All @@ -31,6 +32,7 @@ pytest-playwright==0.3.2
python-multipart==0.0.6
qdrant-client==1.2.0
requests==2.31.0
requests-oauthlib==1.3.1
retry==0.9.2
rfc3986==1.5.0
sentence-transformers==2.2.2
Expand Down
1 change: 1 addition & 0 deletions backend/requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pre-commit==3.2.2
reorder-python-imports==3.9.0
types-beautifulsoup4==4.12.0.3
types-html5lib==1.1.11.13
types-oauthlib==3.2.0.9
types-psycopg2==2.9.21.10
types-python-dateutil==2.8.19.13
types-regex==2023.3.23.1
Expand Down

0 comments on commit 17e2008

Please sign in to comment.