Improved logging and added comments

onyx-dot-app · Weves · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024
commit 590da9ae7ce93dc70af7ae1fd6b4058a01bd7b62
diff --git a/backend/ee/danswer/external_permissions/confluence/doc_sync.py b/backend/ee/danswer/external_permissions/confluence/doc_sync.py
@@ -24,6 +24,57 @@
 _REQUEST_PAGINATION_LIMIT = 100
 
 
+def _extract_user_email(subjects: dict[str, Any]) -> str | None:
+    # If the subject is a user, then return the user's email
+    user = subjects.get("user", {})
+    result = user.get("results", [{}])[0]
+    return result.get("email")
+
+
+def _extract_group_name(subjects: dict[str, Any]) -> str | None:
+    # If the subject is a group, then return the group's name
+    group = subjects.get("group", {})
+    result = group.get("results", [{}])[0]
+    return result.get("name")
+
+
+def _is_public_read_permission(permission: dict[str, Any]) -> bool:
+    # If the permission is a public read permission, then return True
+    operation = permission.get("operation", {})
+    operation_value = operation.get("operation")
+    anonymous_access = permission.get("anonymousAccess", False)
+    return operation_value == "read" and anonymous_access
+
+
+def _extract_read_access_restrictions(
+    restrictions: dict[str, Any]
+) -> tuple[list[str], list[str]]:
+    """
+    WARNING: This function includes no paginated retrieval. So if a page is private
+    within the space and has over 200 users or over 200 groups with explicitly read
+    access, this function will leave out some users or groups.
+    200 is a large amount so it is unlikely, but just be aware.
+    """
+    read_access = restrictions.get("read", {})
+    read_access_restrictions = read_access.get("restrictions", {})
+
+    # Extract the users with read access
+    read_access_user = read_access_restrictions.get("user", {})
+    read_access_user_jsons = read_access_user.get("results", [])
+    read_access_user_emails = [
+        user["email"] for user in read_access_user_jsons if user.get("email")
+    ]
+
+    # Extract the groups with read access
+    read_access_group = read_access_restrictions.get("group", {})
+    read_access_group_jsons = read_access_group.get("results", [])
+    read_access_group_names = [
+        group["name"] for group in read_access_group_jsons if group.get("name")
+    ]
+
+    return read_access_user_emails, read_access_group_names
+
+
 def _get_space_permissions(
     db_session: Session,
     confluence_client: Confluence,
@@ -33,27 +84,29 @@ def _get_space_permissions(
         confluence_client.get_space_permissions
     )
 
-    space_permissions = get_space_permissions(space_id).get("permissions", [])
+    space_permissions_result = get_space_permissions(space_id)
+    logger.debug(f"space_permissions_result: {space_permissions_result}")
+
+    space_permissions = space_permissions_result.get("permissions", [])
     user_emails = set()
     # Confluence enforces that group names are unique
     group_names = set()
     is_externally_public = False
     for permission in space_permissions:
-        subs = permission.get("subjects")
-        if subs:
+        subjects = permission.get("subjects")
+        if subjects:
             # If there are subjects, then there are explicit users or groups with access
-            if email := subs.get("user", {}).get("results", [{}])[0].get("email"):
+            if email := _extract_user_email(subjects):
                 user_emails.add(email)
-            if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"):
+            if group_name := _extract_group_name(subjects):
                 group_names.add(group_name)
         else:
             # If there are no subjects, then the permission is for everyone
-            if permission.get("operation", {}).get(
-                "operation"
-            ) == "read" and permission.get("anonymousAccess", False):
+            if _is_public_read_permission(permission):
                 # If the permission specifies read access for anonymous users, then
                 # the space is publicly accessible
                 is_externally_public = True
+
     batch_add_non_web_user_if_not_exists__no_commit(
         db_session=db_session, emails=list(user_emails)
     )
@@ -64,42 +117,30 @@ def _get_space_permissions(
     )
 
 
-def _get_restrictions_for_page(
+def _get_page_specific_restrictions(
     db_session: Session,
     page: dict[str, Any],
-    space_permissions: ExternalAccess,
-) -> ExternalAccess:
-    """
-    WARNING: This function includes no pagination. So if a page is private within
-    the space and has over 200 users or over 200 groups with explicitly read access,
-    this function will leave out some users or groups.
-    200 is a large amount so it is unlikely, but just be aware.
-    """
-    restrictions_json = page.get("restrictions", {})
-    read_access_dict = restrictions_json.get("read", {}).get("restrictions", {})
-
-    read_access_user_jsons = read_access_dict.get("user", {}).get("results", [])
-    read_access_group_jsons = read_access_dict.get("group", {}).get("results", [])
-
-    is_space_public = read_access_user_jsons == [] and read_access_group_jsons == []
+) -> ExternalAccess | None:
+    (
+        read_access_user_emails,
+        read_access_group_names,
+    ) = _extract_read_access_restrictions(restrictions=page.get("restrictions", {}))
+
+    # If there are no restrictions found, then the page
+    # inherits the space's restrictions so return None
+    is_space_public = read_access_user_emails == [] and read_access_group_names == []
+    if is_space_public:
+        return None
 
-    if not is_space_public:
-        read_access_user_emails = [
-            user["email"] for user in read_access_user_jsons if user.get("email")
-        ]
-        read_access_groups = [group["name"] for group in read_access_group_jsons]
-        batch_add_non_web_user_if_not_exists__no_commit(
-            db_session=db_session, emails=list(read_access_user_emails)
-        )
-        external_access = ExternalAccess(
-            external_user_emails=set(read_access_user_emails),
-            external_user_group_ids=set(read_access_groups),
-            is_public=False,
-        )
-    else:
-        external_access = space_permissions
-
-    return external_access
+    batch_add_non_web_user_if_not_exists__no_commit(
+        db_session=db_session, emails=list(read_access_user_emails)
+    )
+    return ExternalAccess(
+        external_user_emails=set(read_access_user_emails),
+        external_user_group_ids=set(read_access_group_names),
+        # there is no way for a page to be individually public if the space isn't public
+        is_public=False,
+    )
 
 
 def _fetch_attachment_document_ids_for_page_paginated(
@@ -182,14 +223,13 @@ def _fetch_all_page_restrictions_for_space(
     db_session: Session,
     confluence_client: Confluence,
     space_id: str,
-    space_permissions: ExternalAccess,
-) -> dict[str, ExternalAccess]:
+) -> dict[str, ExternalAccess | None]:
     all_pages = _fetch_all_pages_paginated(
         confluence_client=confluence_client,
         space_id=space_id,
     )
 
-    document_restrictions: dict[str, ExternalAccess] = {}
+    document_restrictions: dict[str, ExternalAccess | None] = {}
     for page in all_pages:
         """
         This assigns the same permissions to all attachments of a page and
@@ -199,24 +239,32 @@ def _fetch_all_page_restrictions_for_space(
         may not be their own standalone documents. This is likely fine as we just upsert a
         document with just permissions.
         """
-        attachment_document_ids = [
+        document_ids = []
+
+        # Add the page's document id
+        document_ids.append(
             build_confluence_document_id(
                 base_url=confluence_client.url,
                 content_url=page["_links"]["webui"],
             )
-        ]
-        attachment_document_ids.extend(
+        )
+
+        # Add the page's attachments document ids
+        document_ids.extend(
             _fetch_attachment_document_ids_for_page_paginated(
                 confluence_client=confluence_client, page=page
             )
         )
-        page_permissions = _get_restrictions_for_page(
+
+        # Get the page's specific restrictions
+        page_permissions = _get_page_specific_restrictions(
             db_session=db_session,
             page=page,
-            space_permissions=space_permissions,
         )
-        for attachment_document_id in attachment_document_ids:
-            document_restrictions[attachment_document_id] = page_permissions
+
+        # Apply the page's specific restrictions to the page and its attachments
+        for document_id in document_ids:
+            document_restrictions[document_id] = page_permissions
 
     return document_restrictions
 
@@ -243,12 +291,15 @@ def confluence_doc_sync(
         db_session=db_session,
         confluence_client=confluence_client,
         space_id=cc_pair.connector.connector_specific_config["space"],
-        space_permissions=space_permissions,
     )
-    for doc_id, ext_access in fresh_doc_permissions.items():
+    for doc_id, page_specific_access in fresh_doc_permissions.items():
+        # If there are no page specific restrictions, then
+        # the page inherits the space's restrictions
+        page_access = page_specific_access or space_permissions
+
         upsert_document_external_perms__no_commit(
             db_session=db_session,
             doc_id=doc_id,
-            external_access=ext_access,
+            external_access=page_access,
             source_type=cc_pair.connector.source,
         )
diff --git a/backend/ee/danswer/external_permissions/permission_sync.py b/backend/ee/danswer/external_permissions/permission_sync.py
@@ -44,7 +44,7 @@ def run_external_group_permission_sync(
         # update postgres
         db_session.commit()
     except Exception as e:
-        logger.error(f"Error updating document index: {e}")
+        logger.exception(f"Error Syncing Group Permissions: {e}")
         db_session.rollback()
 
 
@@ -108,5 +108,5 @@ def run_external_doc_permission_sync(
         # update postgres
         db_session.commit()
     except Exception as e:
-        logger.error(f"Error Syncing Permissions: {e}")
+        logger.exception(f"Error Syncing Document Permissions: {e}")
         db_session.rollback()