Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved logging and added comments #2763

Merged
merged 3 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Improved logging and added comments
  • Loading branch information
hagen-danswer committed Oct 10, 2024
commit 590da9ae7ce93dc70af7ae1fd6b4058a01bd7b62
159 changes: 105 additions & 54 deletions backend/ee/danswer/external_permissions/confluence/doc_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,57 @@
_REQUEST_PAGINATION_LIMIT = 100


def _extract_user_email(subjects: dict[str, Any]) -> str | None:
# If the subject is a user, then return the user's email
user = subjects.get("user", {})
result = user.get("results", [{}])[0]
return result.get("email")


def _extract_group_name(subjects: dict[str, Any]) -> str | None:
# If the subject is a group, then return the group's name
group = subjects.get("group", {})
result = group.get("results", [{}])[0]
return result.get("name")


def _is_public_read_permission(permission: dict[str, Any]) -> bool:
# If the permission is a public read permission, then return True
operation = permission.get("operation", {})
operation_value = operation.get("operation")
anonymous_access = permission.get("anonymousAccess", False)
return operation_value == "read" and anonymous_access


def _extract_read_access_restrictions(
restrictions: dict[str, Any]
) -> tuple[list[str], list[str]]:
"""
WARNING: This function includes no paginated retrieval. So if a page is private
within the space and has over 200 users or over 200 groups with explicitly read
access, this function will leave out some users or groups.
200 is a large amount so it is unlikely, but just be aware.
"""
read_access = restrictions.get("read", {})
read_access_restrictions = read_access.get("restrictions", {})

# Extract the users with read access
read_access_user = read_access_restrictions.get("user", {})
read_access_user_jsons = read_access_user.get("results", [])
read_access_user_emails = [
user["email"] for user in read_access_user_jsons if user.get("email")
]

# Extract the groups with read access
read_access_group = read_access_restrictions.get("group", {})
read_access_group_jsons = read_access_group.get("results", [])
read_access_group_names = [
group["name"] for group in read_access_group_jsons if group.get("name")
]

return read_access_user_emails, read_access_group_names


def _get_space_permissions(
db_session: Session,
confluence_client: Confluence,
Expand All @@ -33,27 +84,29 @@ def _get_space_permissions(
confluence_client.get_space_permissions
)

space_permissions = get_space_permissions(space_id).get("permissions", [])
space_permissions_result = get_space_permissions(space_id)
logger.debug(f"space_permissions_result: {space_permissions_result}")

space_permissions = space_permissions_result.get("permissions", [])
user_emails = set()
# Confluence enforces that group names are unique
group_names = set()
is_externally_public = False
for permission in space_permissions:
subs = permission.get("subjects")
if subs:
subjects = permission.get("subjects")
if subjects:
# If there are subjects, then there are explicit users or groups with access
if email := subs.get("user", {}).get("results", [{}])[0].get("email"):
if email := _extract_user_email(subjects):
user_emails.add(email)
if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"):
if group_name := _extract_group_name(subjects):
group_names.add(group_name)
else:
# If there are no subjects, then the permission is for everyone
if permission.get("operation", {}).get(
"operation"
) == "read" and permission.get("anonymousAccess", False):
if _is_public_read_permission(permission):
# If the permission specifies read access for anonymous users, then
# the space is publicly accessible
is_externally_public = True

batch_add_non_web_user_if_not_exists__no_commit(
db_session=db_session, emails=list(user_emails)
)
Expand All @@ -64,42 +117,30 @@ def _get_space_permissions(
)


def _get_restrictions_for_page(
def _get_page_specific_restrictions(
db_session: Session,
page: dict[str, Any],
space_permissions: ExternalAccess,
) -> ExternalAccess:
"""
hagen-danswer marked this conversation as resolved.
Show resolved Hide resolved
WARNING: This function includes no pagination. So if a page is private within
the space and has over 200 users or over 200 groups with explicitly read access,
this function will leave out some users or groups.
200 is a large amount so it is unlikely, but just be aware.
"""
restrictions_json = page.get("restrictions", {})
read_access_dict = restrictions_json.get("read", {}).get("restrictions", {})

read_access_user_jsons = read_access_dict.get("user", {}).get("results", [])
read_access_group_jsons = read_access_dict.get("group", {}).get("results", [])

is_space_public = read_access_user_jsons == [] and read_access_group_jsons == []
) -> ExternalAccess | None:
(
read_access_user_emails,
read_access_group_names,
) = _extract_read_access_restrictions(restrictions=page.get("restrictions", {}))

# If there are no restrictions found, then the page
# inherits the space's restrictions so return None
is_space_public = read_access_user_emails == [] and read_access_group_names == []
if is_space_public:
return None

if not is_space_public:
read_access_user_emails = [
user["email"] for user in read_access_user_jsons if user.get("email")
]
read_access_groups = [group["name"] for group in read_access_group_jsons]
batch_add_non_web_user_if_not_exists__no_commit(
db_session=db_session, emails=list(read_access_user_emails)
)
external_access = ExternalAccess(
external_user_emails=set(read_access_user_emails),
external_user_group_ids=set(read_access_groups),
is_public=False,
)
else:
external_access = space_permissions

return external_access
batch_add_non_web_user_if_not_exists__no_commit(
db_session=db_session, emails=list(read_access_user_emails)
)
return ExternalAccess(
external_user_emails=set(read_access_user_emails),
external_user_group_ids=set(read_access_group_names),
# there is no way for a page to be individually public if the space isn't public
is_public=False,
)


def _fetch_attachment_document_ids_for_page_paginated(
Expand Down Expand Up @@ -182,14 +223,13 @@ def _fetch_all_page_restrictions_for_space(
db_session: Session,
confluence_client: Confluence,
space_id: str,
space_permissions: ExternalAccess,
) -> dict[str, ExternalAccess]:
) -> dict[str, ExternalAccess | None]:
all_pages = _fetch_all_pages_paginated(
confluence_client=confluence_client,
space_id=space_id,
)

document_restrictions: dict[str, ExternalAccess] = {}
document_restrictions: dict[str, ExternalAccess | None] = {}
for page in all_pages:
"""
This assigns the same permissions to all attachments of a page and
Expand All @@ -199,24 +239,32 @@ def _fetch_all_page_restrictions_for_space(
may not be their own standalone documents. This is likely fine as we just upsert a
document with just permissions.
"""
attachment_document_ids = [
document_ids = []

# Add the page's document id
document_ids.append(
build_confluence_document_id(
base_url=confluence_client.url,
content_url=page["_links"]["webui"],
)
]
attachment_document_ids.extend(
)

# Add the page's attachments document ids
document_ids.extend(
_fetch_attachment_document_ids_for_page_paginated(
confluence_client=confluence_client, page=page
)
)
page_permissions = _get_restrictions_for_page(

# Get the page's specific restrictions
page_permissions = _get_page_specific_restrictions(
db_session=db_session,
page=page,
space_permissions=space_permissions,
)
for attachment_document_id in attachment_document_ids:
document_restrictions[attachment_document_id] = page_permissions

# Apply the page's specific restrictions to the page and its attachments
for document_id in document_ids:
document_restrictions[document_id] = page_permissions

return document_restrictions

Expand All @@ -243,12 +291,15 @@ def confluence_doc_sync(
db_session=db_session,
confluence_client=confluence_client,
space_id=cc_pair.connector.connector_specific_config["space"],
space_permissions=space_permissions,
)
for doc_id, ext_access in fresh_doc_permissions.items():
for doc_id, page_specific_access in fresh_doc_permissions.items():
# If there are no page specific restrictions, then
# the page inherits the space's restrictions
page_access = page_specific_access or space_permissions

upsert_document_external_perms__no_commit(
db_session=db_session,
doc_id=doc_id,
external_access=ext_access,
external_access=page_access,
source_type=cc_pair.connector.source,
)
4 changes: 2 additions & 2 deletions backend/ee/danswer/external_permissions/permission_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def run_external_group_permission_sync(
# update postgres
db_session.commit()
except Exception as e:
logger.error(f"Error updating document index: {e}")
logger.exception(f"Error Syncing Group Permissions: {e}")
hagen-danswer marked this conversation as resolved.
Show resolved Hide resolved
db_session.rollback()


Expand Down Expand Up @@ -108,5 +108,5 @@ def run_external_doc_permission_sync(
# update postgres
db_session.commit()
except Exception as e:
logger.error(f"Error Syncing Permissions: {e}")
logger.exception(f"Error Syncing Document Permissions: {e}")
hagen-danswer marked this conversation as resolved.
Show resolved Hide resolved
db_session.rollback()
Loading