chore: Improve email parsing using email trimmer gem (#3611)
Email parsing using the email_trimmer gem. Fixes: #3539, #2954, #3572
This commit is contained in:
31
app/presenters/html_parser.rb
Normal file
31
app/presenters/html_parser.rb
Normal file
@@ -0,0 +1,31 @@
|
||||
# Reduces the HTML body of an email to plain text after stripping the parts
# that are treated as quoted material.
#
# Every <blockquote> element is swapped for the literal marker "> " and every
# <table> element is removed; what is left is handed to Html2Text.
#
# NOTE(review): nothing in this class requires Nokogiri or Html2Text; both
# constants are presumably loaded by the application - confirm.
class HtmlParser
  # Builds a parser for +raw_body+ and returns its #filtered_text.
  #
  # @param raw_body [String] HTML markup to reduce
  # @return whatever Html2Text.convert returns for the filtered markup
  def self.parse_reply(raw_body)
    new(raw_body).filtered_text
  end

  attr_reader :raw_body

  # @param raw_body [String] HTML markup to reduce
  def initialize(raw_body)
    @raw_body = raw_body
  end

  # Parsed form of #raw_body. Memoized, so #filter_replies! mutates the same
  # object that #filtered_html later serializes.
  def document
    @document ||= Nokogiri::HTML(raw_body)
  end

  # Mutates #document in place: blockquotes become "> ", tables are dropped.
  def filter_replies!
    document.xpath('//blockquote').each { |n| n.replace('> ') }
    document.xpath('//table').each(&:remove)
  end

  # Markup of #document after filtering. Memoized so the filter runs once.
  def filtered_html
    @filtered_html ||= begin
      filter_replies!
      document.inner_html
    end
  end

  # Result of Html2Text.convert applied to #filtered_html. Memoized.
  def filtered_text
    @filtered_text ||= Html2Text.convert(filtered_html)
  end
end
|
||||
@@ -8,30 +8,48 @@ class MailPresenter < SimpleDelegator
|
||||
end
|
||||
|
||||
# Returns the subject of @mail passed through encode_to_unicode.
#
# NOTE(review): a nil subject is forwarded as-is; confirm that
# encode_to_unicode tolerates nil.
def subject
  encode_to_unicode(@mail.subject)
end
|
||||
|
||||
# Builds the text view of the message from select_body.
#
# Returns {} when the selected body is blank. Otherwise returns a hash with
# :full (a fresh select_body result), :reply (the selected body) and :quoted
# (the EmailReplyTrimmer.trim output re-tagged with the body's original
# encoding and transcoded to UTF-8). The hash is memoized in @text_content.
#
# NOTE(review): :reply carries the untrimmed body while :quoted carries the
# trimmed text - the key names look inverted; verify against callers before
# changing.
def text_content
  @decoded_text_content = select_body || ''
  encoding = @decoded_text_content.encoding

  body = EmailReplyTrimmer.trim(@decoded_text_content)

  return {} if @decoded_text_content.blank?

  @text_content ||= {
    full: select_body,
    reply: @decoded_text_content,
    quoted: body.force_encoding(encoding).encode('UTF-8')
  }
end
|
||||
|
||||
# Picks the body to present for the message.
#
# The text part is preferred, then the HTML part, then the message itself.
# The decoded content is passed through encode_to_unicode. An empty string is
# returned when the content contains one of the trigger phrases below; HTML
# is reduced via HtmlParser when the mail's content type mentions text/html.
#
# NOTE(review): the trigger pattern is unanchored and its second alternative
# is the bare text "text/plain", so any body that merely mentions that
# string is discarded - confirm this is intended.
# NOTE(review): the content type is read from the top-level mail, not from
# the part that was selected - confirm.
def select_body
  message = mail.text_part || mail.html_part || mail
  decoded = encode_to_unicode(message.decoded)

  # Certain trigger phrases that mean we didn't parse correctly
  return '' if %r{(Content-Type: multipart/alternative|text/plain)}.match?(decoded)

  if (mail.content_type || '').include?('text/html')
    ::HtmlParser.parse_reply(decoded)
  else
    decoded
  end
end
|
||||
|
||||
# Builds the HTML view of the message from select_body.
#
# Returns {} when the selected body is blank. Otherwise returns a hash with
# :full (a fresh select_body result), :reply (the selected body) and :quoted
# (the EmailReplyTrimmer.trim output). The hash is memoized in @html_content.
#
# NOTE(review): the body comes from select_body, which prefers the text part
# over the HTML part, so this may hold plain text - confirm that is intended.
# NOTE(review): :reply carries the untrimmed body while :quoted carries the
# trimmed text - verify against callers.
def html_content
  @decoded_html_content = select_body || ''

  return {} if @decoded_html_content.blank?

  body = EmailReplyTrimmer.trim(@decoded_html_content)

  @html_content ||= {
    full: select_body,
    reply: @decoded_html_content,
    quoted: body
  }
end
|
||||
|
||||
@@ -47,14 +65,6 @@ class MailPresenter < SimpleDelegator
|
||||
end
|
||||
end
|
||||
|
||||
# Decoded body of the message.
#
# For a multipart mail this is the decoded text part, or nil when there is no
# text part; otherwise it is the decoded mail itself.
def decoded_message
  return mail.decoded unless mail.multipart?

  mail.text_part&.decoded
end
|
||||
|
||||
# Number of entries in the mail's attachments collection.
def number_of_attachments
  mail.attachments.count
end
|
||||
@@ -114,21 +124,8 @@ class MailPresenter < SimpleDelegator
|
||||
return str if current_encoding == 'UTF-8'
|
||||
|
||||
str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
||||
end
|
||||
|
||||
# Splits +content+ at the earliest match of any pattern returned by
# quoted_text_regexes.
#
# @param content [String] message body
# @return [Hash] :reply is the stripped text before the split point and
#   :quoted_text the stripped text from the split point on. When no pattern
#   matches, everything is the reply and :quoted_text is empty.
#
# If anything raises (for example +content+ is nil), nothing is treated as
# quoted: the hash is still returned so callers can index it by key.
def extract_reply(content)
  @regex_arr ||= quoted_text_regexes

  # Offset of the match closest to the top; content.length when none matches.
  index = @regex_arr.map { |regex| content.index(regex) }.compact.min || content.length

  {
    # Exclusive range: a marker at offset 0 must yield an empty reply, whereas
    # 0..(index - 1) would become 0..-1 and return the whole content.
    reply: content[0...index].strip,
    quoted_text: content[index..].strip
  }
rescue StandardError
  { reply: content.to_s, quoted_text: '' }
end
|
||||
|
||||
def quoted_text_regexes
|
||||
|
||||
Reference in New Issue
Block a user