You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ponymail.apache.org by se...@apache.org on 2018/10/09 23:20:44 UTC
[incubator-ponymail] branch master updated: Bug: cannot download more than 10K mails to a mbox file
This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
The following commit(s) were added to refs/heads/master by this push:
new e6e5d80 Bug: cannot download more than 10K mails to a mbox file
e6e5d80 is described below
commit e6e5d80caa509a803e91488b52c2aced87c97c9f
Author: Sebb <se...@apache.org>
AuthorDate: Wed Oct 10 00:20:42 2018 +0100
Bug: cannot download more than 10K mails to a mbox file
This fixes #475
---
CHANGELOG.md | 1 +
site/api/mbox.lua | 166 +++++++++++++++++++++++++++++++++++-------------------
2 files changed, 110 insertions(+), 57 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 658d174..02860be 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
## Changes in 0.11-SNAPSHOT
+- Bug: cannot download more than 10K mails to a mbox file (#475)
- Bug: no need to sort after scroll (#477)
- Enh: Ensure non-printable chars are not lost in source and mbox output (#476)
- Enh: display buttons even if no mails are found in a month (#470)
diff --git a/site/api/mbox.lua b/site/api/mbox.lua
index 780f242..2e94f42 100644
--- a/site/api/mbox.lua
+++ b/site/api/mbox.lua
@@ -53,6 +53,37 @@ local function getFromLine(r, source)
return "From " .. replyTo .. " " .. timeStamp
end
+local function writeMbox(r, docs)
+ -- for each email, get the actual source of it to plop into the mbox file
+ for k, v in pairs(docs.hits.hits) do
+ v = v._source
+ local doc = elastic.get('mbox_source', v.mid)
+ if doc and doc.source then
+ local checkFirst -- should we check the first line?
+ if not doc.source:match('^From ') then -- only add the header if there is none
+ r:puts(getFromLine(r, doc.source))
+ r:puts("\n")
+ checkFirst=true
+ else
+ checkFirst=false
+ end
+
+ -- pick out individual lines (including last which may not have EOL)
+ -- it's tricky to add the prefix to the output unless the From is at the start of a line
+ -- so it's easier to just skip the first match if necessary
+ for line in doc.source:gmatch("[^\r\n]*\r?\n?") do
+ -- check if 'From ' needs to be escaped
+ if checkFirst and line:match("^From ") then r:puts(">") end
+ checkFirst=true
+ -- TODO consider whether to optionally prefix '>From ', '^>>From ' etc.
+ -- If so, just change the RE to "^>*From "
+ r:write(line) -- original line
+ end
+ r:puts("\n")
+ end
+ end
+end
+
function handle(r)
cross.contentType(r, "application/mbox")
local get = r:parseargs()
@@ -77,75 +108,96 @@ function handle(r)
if r.headers_out then
r.headers_out['Content-Disposition'] = ("attachment; filename=%s_%04d-%02d.mbox"):format(flid,y,m)
end
-
- -- fetch all results from the list (up to 10k results), make sure to get the 'private' element
- local docs = elastic.raw {
- _source = {'mid','private'},
+
+ local DATERANGE = {
+ range = {
+ date = {
+ gte = ("%04d/%02d/%02d 00:00:00"):format(y,m,1),
+ lte = ("%04d/%02d/%02d 23:59:59"):format(y,m,d)
+ }
+ }
+ }
+
+ local LIST = {
+ term = {
+ list_raw = lid
+ }
+ }
+
+ -- Pre-process the list to find its size and whether there are any private mails
+ local squery = {
query = {
bool = {
must = {
- {
- range = {
- date = {
- gte = ("%04d/%02d/%02d 00:00:00"):format(y,m,1),
- lte = ("%04d/%02d/%02d 23:59:59"):format(y,m,d)
- }
- }
- },
- {
- term = {
- list_raw = lid
- }
- }
+ DATERANGE,
+ LIST
}
}
},
- sort = {
+ size = 0, -- no data wanted this time
+ aggs = {
+ privacy = {
+ terms = {
+ field = "private"
+ }
+ }
+ }
+ }
+
+ -- find list details
+ local docs = elastic.raw(squery)
+ local total_docs = docs.hits.total
+
+ local fetchPrivate = false -- should we try to fetch private messages?
+ for _, privacy in pairs(docs.aggregations.privacy.buckets) do
+ -- do we have a private message?
+ if privacy.key_as_string == "true" and privacy.doc_count > 0 then
+ -- if so, are we allowed access?
+ fetchPrivate = aaa.canAccessList(r, lid, user.get(r))
+ break
+ end
+ end
+
+ -- Now set up the data query
+ local MUST
+ if fetchPrivate then
+ MUST = {
+ DATERANGE,
+ LIST
+ }
+ else -- either there are no private messages or we don't have access
+ MUST = {
+ DATERANGE,
+ LIST,
{
- epoch = {
- order = "asc"
+ term = {
+ private = false
}
- }
+ }
+ }
+ end
+
+ -- create the actual query
+ local squery = {
+ _source = {'mid'},
+ query = {
+ bool = {
+ must = MUST
+ }
},
- size = 10000
+ size = elastic.MAX_RESULT_WINDOW
}
- local account = user.get(r)
- local listAccessible = nil -- not yet initialised
- -- for each email, get the actual source of it to plop into the mbox file
- for k, v in pairs(docs.hits.hits) do
- v = v._source
- -- aaa.rights() can be expensive, so only do it once per download
- if v.private and listAccessible == nil then
- -- we are dealing with a single list here so only need to check once
- listAccessible = aaa.canAccessList(r, lid, account)
- end
- if listAccessible or not v.private then
- local doc = elastic.get('mbox_source', v.mid)
- if doc and doc.source then
- local checkFirst -- should we check the first line?
- if not doc.source:match('^From ') then -- only add the header if there is none
- r:puts(getFromLine(r, doc.source))
- r:puts("\n")
- checkFirst=true
- else
- checkFirst=false
- end
-
- -- pick out individual lines (including last which may not have EOL)
- -- it's tricky to add the prefix to the output unless the From is at the start of a line
- -- so it's easier to just skip the first match if necessary
- for line in doc.source:gmatch("[^\r\n]*\r?\n?") do
- -- check if 'From ' needs to be escaped
- if checkFirst and line:match("^From ") then r:puts(">") end
- checkFirst=true
- -- TODO consider whether to optionally prefix '>From ', '^>>From ' etc.
- -- If so, just change the RE to "^>*From "
- r:write(line) -- original line
- end
- r:puts("\n")
- end
+ if total_docs > elastic.MAX_RESULT_WINDOW then
+ local docs, sid = elastic.scroll(squery)
+ while docs and docs.hits and docs.hits.hits and #docs.hits.hits > 0 do -- scroll as long as we get new results
+ writeMbox(r, docs)
+ docs, sid = elastic.scroll(sid)
end
+ elastic.clear_scroll(sid) -- we're done with the sid, release it
+ else
+ local docs = elastic.raw(squery)
+ writeMbox(r, docs)
end
else
cross.contentType(r, "text/plain")