chore: Improve email parsing using email trimmer gem (#3611)
Email parsing using the email_reply_trimmer gem. Fixes: #3539, #2954, #3572
This commit is contained in:
4
Gemfile
4
Gemfile
@@ -121,6 +121,10 @@ gem 'hairtrigger'
|
|||||||
|
|
||||||
gem 'procore-sift'
|
gem 'procore-sift'
|
||||||
|
|
||||||
|
# parse email
|
||||||
|
gem 'email_reply_trimmer'
|
||||||
|
gem 'html2text'
|
||||||
|
|
||||||
group :production, :staging do
|
group :production, :staging do
|
||||||
# we dont want request timing out in development while using byebug
|
# we dont want request timing out in development while using byebug
|
||||||
gem 'rack-timeout'
|
gem 'rack-timeout'
|
||||||
|
|||||||
@@ -179,6 +179,7 @@ GEM
|
|||||||
addressable (~> 2.8)
|
addressable (~> 2.8)
|
||||||
ecma-re-validator (0.3.0)
|
ecma-re-validator (0.3.0)
|
||||||
regexp_parser (~> 2.0)
|
regexp_parser (~> 2.0)
|
||||||
|
email_reply_trimmer (0.1.13)
|
||||||
erubi (1.10.0)
|
erubi (1.10.0)
|
||||||
erubis (2.7.0)
|
erubis (2.7.0)
|
||||||
et-orbi (1.2.5)
|
et-orbi (1.2.5)
|
||||||
@@ -290,6 +291,8 @@ GEM
|
|||||||
hashdiff (1.0.1)
|
hashdiff (1.0.1)
|
||||||
hashie (4.1.0)
|
hashie (4.1.0)
|
||||||
hkdf (0.3.0)
|
hkdf (0.3.0)
|
||||||
|
html2text (0.2.1)
|
||||||
|
nokogiri (~> 1.6)
|
||||||
http-accept (1.7.0)
|
http-accept (1.7.0)
|
||||||
http-cookie (1.0.4)
|
http-cookie (1.0.4)
|
||||||
domain_name (~> 0.5)
|
domain_name (~> 0.5)
|
||||||
@@ -668,6 +671,7 @@ DEPENDENCIES
|
|||||||
devise_token_auth
|
devise_token_auth
|
||||||
dotenv-rails
|
dotenv-rails
|
||||||
down (~> 5.0)
|
down (~> 5.0)
|
||||||
|
email_reply_trimmer
|
||||||
facebook-messenger
|
facebook-messenger
|
||||||
factory_bot_rails
|
factory_bot_rails
|
||||||
faker
|
faker
|
||||||
@@ -682,6 +686,7 @@ DEPENDENCIES
|
|||||||
haikunator
|
haikunator
|
||||||
hairtrigger
|
hairtrigger
|
||||||
hashie
|
hashie
|
||||||
|
html2text
|
||||||
image_processing
|
image_processing
|
||||||
jbuilder
|
jbuilder
|
||||||
json_refs
|
json_refs
|
||||||
|
|||||||
31
app/presenters/html_parser.rb
Normal file
31
app/presenters/html_parser.rb
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
class HtmlParser
|
||||||
|
def self.parse_reply(raw_body)
|
||||||
|
new(raw_body).filtered_text
|
||||||
|
end
|
||||||
|
|
||||||
|
attr_reader :raw_body
|
||||||
|
|
||||||
|
def initialize(raw_body)
|
||||||
|
@raw_body = raw_body
|
||||||
|
end
|
||||||
|
|
||||||
|
def document
|
||||||
|
@document ||= Nokogiri::HTML(raw_body)
|
||||||
|
end
|
||||||
|
|
||||||
|
def filter_replies!
|
||||||
|
document.xpath('//blockquote').each { |n| n.replace('> ') }
|
||||||
|
document.xpath('//table').each(&:remove)
|
||||||
|
end
|
||||||
|
|
||||||
|
def filtered_html
|
||||||
|
@filtered_html ||= begin
|
||||||
|
filter_replies!
|
||||||
|
document.inner_html
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def filtered_text
|
||||||
|
@filtered_text ||= Html2Text.convert(filtered_html)
|
||||||
|
end
|
||||||
|
end
|
||||||
@@ -8,30 +8,48 @@ class MailPresenter < SimpleDelegator
|
|||||||
end
|
end
|
||||||
|
|
||||||
def subject
|
def subject
|
||||||
encode_to_unicode(@mail.subject || '')
|
encode_to_unicode(@mail.subject)
|
||||||
end
|
end
|
||||||
|
|
||||||
def text_content
|
def text_content
|
||||||
@decoded_text_content ||= encode_to_unicode(text_part&.decoded || decoded_message || '')
|
@decoded_text_content = select_body || ''
|
||||||
|
encoding = @decoded_text_content.encoding
|
||||||
|
|
||||||
|
body = EmailReplyTrimmer.trim(@decoded_text_content)
|
||||||
|
|
||||||
return {} if @decoded_text_content.blank?
|
return {} if @decoded_text_content.blank?
|
||||||
|
|
||||||
@text_content ||= {
|
@text_content ||= {
|
||||||
full: @decoded_text_content,
|
full: select_body,
|
||||||
reply: extract_reply(@decoded_text_content)[:reply],
|
reply: @decoded_text_content,
|
||||||
quoted: extract_reply(@decoded_text_content)[:quoted_text]
|
quoted: body.force_encoding(encoding).encode('UTF-8')
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def select_body
|
||||||
|
message = mail.text_part || mail.html_part || mail
|
||||||
|
decoded = encode_to_unicode(message.decoded)
|
||||||
|
# Certain trigger phrases that mean we didn't parse correctly
|
||||||
|
return '' if %r{(Content-Type: multipart/alternative|text/plain)}.match?(decoded)
|
||||||
|
|
||||||
|
if (mail.content_type || '').include? 'text/html'
|
||||||
|
::HtmlParser.parse_reply(decoded)
|
||||||
|
else
|
||||||
|
decoded
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def html_content
|
def html_content
|
||||||
@decoded_html_content ||= encode_to_unicode(html_part&.decoded)
|
@decoded_html_content = select_body || ''
|
||||||
|
|
||||||
return {} if @decoded_html_content.blank?
|
return {} if @decoded_html_content.blank?
|
||||||
|
|
||||||
|
body = EmailReplyTrimmer.trim(@decoded_html_content)
|
||||||
|
|
||||||
@html_content ||= {
|
@html_content ||= {
|
||||||
full: @decoded_html_content,
|
full: select_body,
|
||||||
reply: extract_reply(@decoded_html_content)[:reply],
|
reply: @decoded_html_content,
|
||||||
quoted: extract_reply(@decoded_html_content)[:quoted_text]
|
quoted: body
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -47,14 +65,6 @@ class MailPresenter < SimpleDelegator
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def decoded_message
|
|
||||||
if mail.multipart?
|
|
||||||
return mail.text_part ? mail.text_part.decoded : nil
|
|
||||||
end
|
|
||||||
|
|
||||||
mail.decoded
|
|
||||||
end
|
|
||||||
|
|
||||||
def number_of_attachments
|
def number_of_attachments
|
||||||
mail.attachments.count
|
mail.attachments.count
|
||||||
end
|
end
|
||||||
@@ -114,21 +124,8 @@ class MailPresenter < SimpleDelegator
|
|||||||
return str if current_encoding == 'UTF-8'
|
return str if current_encoding == 'UTF-8'
|
||||||
|
|
||||||
str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
||||||
end
|
rescue StandardError
|
||||||
|
''
|
||||||
def extract_reply(content)
|
|
||||||
@regex_arr ||= quoted_text_regexes
|
|
||||||
|
|
||||||
content_length = content.length
|
|
||||||
# calculates the matching regex closest to top of page
|
|
||||||
index = @regex_arr.inject(content_length) do |min, regex|
|
|
||||||
[(content.index(regex) || content_length), min].min
|
|
||||||
end
|
|
||||||
|
|
||||||
{
|
|
||||||
reply: content[0..(index - 1)].strip,
|
|
||||||
quoted_text: content[index..].strip
|
|
||||||
}
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def quoted_text_regexes
|
def quoted_text_regexes
|
||||||
|
|||||||
47
spec/fixtures/files/mail_with_quote.eml
vendored
Normal file
47
spec/fixtures/files/mail_with_quote.eml
vendored
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
MIME-Version: 1.0
|
||||||
|
Date: Thu, 19 Aug 2021 14:14:31 +0530
|
||||||
|
References: <CAFkiBVxGoURoqdkY-O_25F-8b41kb-GWBc6hh4Djd5ynwOikXA@mail.gmail.com> <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
|
||||||
|
In-Reply-To: <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
|
||||||
|
Message-ID: <CAFkiBVwJjO_k_e-LpiKi7MAQAKbHX5nkEPcf0y1R=bjcEHogMg@mail.gmail.com>
|
||||||
|
Subject: Re: Checking mail forwarding to cw inbox
|
||||||
|
From: Sony Mathew <sony@chatwoot.com>
|
||||||
|
To: Tejaswini <reply+6bdc3f4d-0bec-4515-a284-5d916fdde489@example.com>
|
||||||
|
Content-Type: multipart/alternative; boundary="0000000000004af64505c9e58f03"
|
||||||
|
|
||||||
|
--0000000000004af64505c9e58f03
|
||||||
|
Content-Type: text/plain; charset="UTF-8"
|
||||||
|
|
||||||
|
Yes, I am providing you step how to reproduce this issue
|
||||||
|
|
||||||
|
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test <
|
||||||
|
tejaswini@chatwoot.com> wrote:
|
||||||
|
|
||||||
|
> Any update on this?
|
||||||
|
>
|
||||||
|
>
|
||||||
|
|
||||||
|
--
|
||||||
|
* Sony Mathew*
|
||||||
|
Software developer
|
||||||
|
*Mob:9999999999
|
||||||
|
|
||||||
|
--0000000000004af64505c9e58f03
|
||||||
|
Content-Type: text/html; charset="UTF-8"
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
<div dir=3D"ltr">Yes, I am providing you step how to reproduce this issue</=
|
||||||
|
div><br><div class=3D"gmail_quote"><div dir=3D"ltr" class=3D"gmail_attr">On=
|
||||||
|
Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test &l=
|
||||||
|
t;<a href=3D"mailto:tejaswini@chatwoot.com">tejaswini@chatwoot.com</a>> wrot=
|
||||||
|
e:<br></div><blockquote class=3D"gmail_quote" style=3D"margin:0px 0px 0px 0=
|
||||||
|
.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"> <p>
|
||||||
|
</p><p>Any update on this?</p>
|
||||||
|
|
||||||
|
<p></p>
|
||||||
|
</blockquote></div><br clear=3D"all"><div><br></div>-- <br><div dir=3D"ltr"=
|
||||||
|
class=3D"gmail_signature"><div dir=3D"ltr"><div><div dir=3D"ltr"><div><div=
|
||||||
|
><b>Sony Mathew.</b><br></div><span style=3D"font-family:"times ne=
|
||||||
|
w roman",serif"><span></span><span></span>Software developer</span><br=
|
||||||
|
></div><b>Mob:9999999999</b></div></div></div></div>
|
||||||
|
|
||||||
|
--0000000000004af64505c9e58f03--
|
||||||
1061
spec/fixtures/files/welcome_html.eml
vendored
Normal file
1061
spec/fixtures/files/welcome_html.eml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@ RSpec.describe ReplyMailbox, type: :mailbox do
|
|||||||
let(:account) { create(:account) }
|
let(:account) { create(:account) }
|
||||||
let(:agent) { create(:user, email: 'agent1@example.com', account: account) }
|
let(:agent) { create(:user, email: 'agent1@example.com', account: account) }
|
||||||
let(:reply_mail) { create_inbound_email_from_fixture('reply.eml') }
|
let(:reply_mail) { create_inbound_email_from_fixture('reply.eml') }
|
||||||
|
let(:mail_with_quote) { create_inbound_email_from_fixture('mail_with_quote.eml') }
|
||||||
let(:conversation) { create(:conversation, assignee: agent, inbox: create(:inbox, account: account, greeting_enabled: false), account: account) }
|
let(:conversation) { create(:conversation, assignee: agent, inbox: create(:inbox, account: account, greeting_enabled: false), account: account) }
|
||||||
let(:described_subject) { described_class.receive reply_mail }
|
let(:described_subject) { described_class.receive reply_mail }
|
||||||
let(:serialized_attributes) do
|
let(:serialized_attributes) do
|
||||||
@@ -95,5 +96,35 @@ RSpec.describe ReplyMailbox, type: :mailbox do
|
|||||||
expect(conversation_1.messages.last.content).to eq("Let's talk about these images:")
|
expect(conversation_1.messages.last.content).to eq("Let's talk about these images:")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'with quotes in email' do
|
||||||
|
let(:described_subject) { described_class.receive mail_with_quote }
|
||||||
|
|
||||||
|
before do
|
||||||
|
# this UUID is hardcoded in mail_with_quote.eml, that's why we are updating this
|
||||||
|
conversation.uuid = '6bdc3f4d-0bec-4515-a284-5d916fdde489'
|
||||||
|
conversation.save
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'add the mail content as new message on the conversation' do
|
||||||
|
described_subject
|
||||||
|
expect(conversation.messages.last.content).to eq(
|
||||||
|
<<-BODY.strip_heredoc.chomp
|
||||||
|
Yes, I am providing you step how to reproduce this issue
|
||||||
|
|
||||||
|
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test < tejaswini@chatwoot.com> wrote:
|
||||||
|
|
||||||
|
> Any update on this?
|
||||||
|
>
|
||||||
|
>
|
||||||
|
|
||||||
|
--
|
||||||
|
* Sony Mathew*
|
||||||
|
Software developer
|
||||||
|
*Mob:9999999999
|
||||||
|
BODY
|
||||||
|
)
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
15
spec/presenters/html_parser_spec.rb
Normal file
15
spec/presenters/html_parser_spec.rb
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
require 'rails_helper'
|
||||||
|
RSpec.describe HtmlParser do
|
||||||
|
include ActionMailbox::TestHelper
|
||||||
|
|
||||||
|
describe 'parsed mail decorator' do
|
||||||
|
let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail }
|
||||||
|
|
||||||
|
it 'parse html content in the mail' do
|
||||||
|
decorated_html_mail = described_class.parse_reply(html_mail.text_part.decoded)
|
||||||
|
expect(decorated_html_mail[0..70]).to eq(
|
||||||
|
"I'm learning English as a first language for the past 13 years, but to "
|
||||||
|
)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
@@ -4,6 +4,7 @@ RSpec.describe MailPresenter do
|
|||||||
|
|
||||||
describe 'parsed mail decorator' do
|
describe 'parsed mail decorator' do
|
||||||
let(:mail) { create_inbound_email_from_fixture('welcome.eml').mail }
|
let(:mail) { create_inbound_email_from_fixture('welcome.eml').mail }
|
||||||
|
let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail }
|
||||||
let(:decorated_mail) { described_class.new(mail) }
|
let(:decorated_mail) { described_class.new(mail) }
|
||||||
|
|
||||||
let(:mail_with_no_subject) { create_inbound_email_from_fixture('mail_with_no_subject.eml').mail }
|
let(:mail_with_no_subject) { create_inbound_email_from_fixture('mail_with_no_subject.eml').mail }
|
||||||
@@ -56,5 +57,13 @@ RSpec.describe MailPresenter do
|
|||||||
it 'give email from in downcased format' do
|
it 'give email from in downcased format' do
|
||||||
expect(decorated_mail.from.first.eql?(mail.from.first.downcase)).to eq true
|
expect(decorated_mail.from.first.eql?(mail.from.first.downcase)).to eq true
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'parse html content in the mail' do
|
||||||
|
decorated_html_mail = described_class.new(html_mail)
|
||||||
|
expect(decorated_html_mail.subject).to eq('Fwd: How good are you in English? How did you improve your English?')
|
||||||
|
expect(decorated_html_mail.text_content[:reply][0..70]).to eq(
|
||||||
|
"I'm learning English as a first language for the past 13 years, but to "
|
||||||
|
)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user