You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by se...@apache.org on 2021/12/16 13:58:47 UTC
[incubator-ponymail-foal] 01/02: stats is inefficient; it gets inaccessible mails
This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
commit 91f8ad856d0f77734f0072072a3fa0a4384c8e70
Author: Sebb <se...@apache.org>
AuthorDate: Thu Dec 16 13:58:07 2021 +0000
stats is inefficient; it gets inaccessible mails
This fixes #181
---
server/endpoints/stats.py | 7 +++++++
server/plugins/messages.py | 37 +++++++++++++++++++------------------
2 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/server/endpoints/stats.py b/server/endpoints/stats.py
index 75f46e3..01513ec 100644
--- a/server/endpoints/stats.py
+++ b/server/endpoints/stats.py
@@ -49,6 +49,13 @@ async def process(
except AssertionError as ae: # If defuzzer encounters internal errors, it will throw an AssertionError
return aiohttp.web.Response(headers={"content-type": "text/plain",}, status=500, text=str(ae))
+ # get a filter for use with get_activity_span (no date)
+ # It can also be used with dated queries
+ query_filter = await plugins.messages.get_accessible_filter(session, query_defuzzed_nodate)
+ if query_filter:
+ query_defuzzed['filter'] = query_filter
+ query_defuzzed_nodate['filter'] = query_filter
+
# since: check if there have been recent updates to the data
if 'since' in indata:
since = indata.get('since', None)
diff --git a/server/plugins/messages.py b/server/plugins/messages.py
index 781a8c2..46fd0f1 100644
--- a/server/plugins/messages.py
+++ b/server/plugins/messages.py
@@ -350,7 +350,6 @@ async def query_batch(
"""
assert session.database, DATABASE_NOT_CONNECTED
preserve_order = True if epoch_order == "asc" else False
- query_defuzzed = await filter_accessible(session, query_defuzzed)
es_query = {
"query": {"bool": query_defuzzed},
"sort": [{"epoch": {"order": epoch_order}}],
@@ -439,17 +438,16 @@ async def query(
async def wordcloud(session: plugins.session.SessionObject, query_defuzzed: dict) -> dict:
"""
Wordclouds via significant terms query in ES
+ The query must include a private mail filter if necessary
"""
wc = {}
try:
- # Copy the query and ensure we're only looking at public content
- wc_public_query = dict(query_defuzzed)
- wc_public_query["filter"] = [{"term": {"private": False}}]
+
assert session.database, DATABASE_NOT_CONNECTED
res = await session.database.search(
body={
"size": 0,
- "query": {"bool": wc_public_query},
+ "query": {"bool": query_defuzzed},
"aggregations": {
"cloud": {"significant_terms": {"field": "subject", "size": 10}}
},
@@ -463,17 +461,18 @@ async def wordcloud(session: plugins.session.SessionObject, query_defuzzed: dict
pass
return wc
-async def filter_accessible(session: plugins.session.SessionObject, query_defuzzed: dict) -> dict:
+async def get_accessible_filter(session: plugins.session.SessionObject, query_defuzzed: dict) -> typing.Optional[list]:
"""
- Update query to take account of private emails
- Reduces the need to filter out emails later
+ Return a filter to be applied to the query to exclude inaccessible mails.
+ If no filter is needed, return None
+ e.g.
+ query_filter = get_accessible_filter(session, query)
+ if query_filter:
+ query['filter'] = query_filter
"""
- query_copy = dict(query_defuzzed)
if not session.credentials:
# if no credentials, only need to search public mails
- query_copy["filter"] = [{"term": {"private": False}}]
- return query_copy
-
+ return [{"term": {"private": False}}]
# which private lists might be involved in the search?
fuzz_private_only = dict(query_defuzzed)
fuzz_private_only["filter"] = [{"term": {"private": True}}]
@@ -501,18 +500,20 @@ async def filter_accessible(session: plugins.session.SessionObject, query_defuzz
# If we can't access all private lists found, either only public emails or lists we can access.
if not private_lists_accessible: # No private lists accessible, just filter for public
- query_copy["filter"] = [{"term": {"private": False}}]
+ return [{"term": {"private": False}}]
elif private_lists_found != private_lists_accessible: # Some private lists, search for public OR those..
- query_copy["filter"] = [
+ return [
{"bool": {"should": [{"term": {"private": False}}, {"terms": {"list_raw": private_lists_accessible}}]}}
]
- return query_copy
+ return None
-async def get_activity_span(session: plugins.session.SessionObject, query_defuzzed: dict) -> typing.Tuple[datetime.datetime, datetime.datetime, dict]:
- """ Fetches the activity span of a search as well as active months within that span """
- query_defuzzed = await filter_accessible(session, query_defuzzed)
+async def get_activity_span(session: plugins.session.SessionObject, query_defuzzed: dict) -> typing.Tuple[datetime.datetime, datetime.datetime, dict]:
+ """
+ Fetches the activity span of a search as well as active months within that span
+ The query must include a private filter if necessary
+ """
# Get oldest and youngest doc in single scan, as well as a monthly histogram
assert session.database, DATABASE_NOT_CONNECTED