move inbound e-mail parsing guts into an extra, add tests for it

This commit is contained in:
joshua stein 2014-02-03 19:05:51 -06:00
parent 2f55605645
commit b75468a2c9
7 changed files with 279 additions and 93 deletions

110
extras/email_parser.rb Normal file
View File

@ -0,0 +1,110 @@
class EmailParser
attr_reader :sender, :recipient, :email_text, :email
def initialize(sender, recipient, email_text)
@sender = sender
@recipient = recipient
@email_text = email_text.forcibly_convert_to_utf8
@email = nil
begin
# the mail gem stupidly spams STDERR while parsing e-mail, so silence
# that stream to avoid anything getting back to postfix
Utils.silence_stream(STDERR) do
@email = Mail.read_from_string(email_text)
end
rescue
end
@sending_user = nil
@parent = nil
@body = nil
end
def user_token
@recipient.gsub(/^#{Rails.application.shortname}-/, "").gsub(/@.*/, "")
end
def been_here?
!!@email_text.match(/^X-BeenThere: #{Rails.application.shortname}-/i)
end
def sending_user
return @sending_user if @sending_user
if (user = User.where(:mailing_list_enabled => true,
:mailing_list_token => user_token).first) && user.is_active?
@sending_user = user
return user
end
end
def parent
return @parent if @parent
irt = self.email[:in_reply_to].to_s.gsub(/[^A-Za-z0-9@\.]/, "")
if m = irt.match(/^comment\.([^\.]+)\.\d+@/)
@parent = Comment.where(:short_id => m[1]).first
elsif m = irt.match(/^story\.([^\.]+)\.\d+@/)
@parent = Story.where(:short_id => m[1]).first
end
@parent
end
def body
return @body if @body
@possible_charset = nil
if self.email.multipart?
# parts[0] - multipart/alternative
# parts[0].parts[0] - text/plain
# parts[0].parts[1] - text/html
if (p = self.email.parts.first.parts.select{|p|
p.content_type.match(/text\/plain/i) }).any?
@body = p.first.body.to_s
begin
@possible_charset = p.first.content_type_parameters["charset"]
rescue
end
# parts[0] - text/plain
elsif (p = self.email.parts.select{|p|
p.content_type.match(/text\/plain/i) }).any?
@body = p.first.body.to_s
begin
@possible_charset = p.first.content_type_parameters["charset"]
rescue
end
end
# simple one-part
elsif self.email.content_type.to_s.match(/text\/plain/)
@body = self.email.body.to_s
begin
@possible_charset = self.email.content_type_parameters["charset"]
rescue
end
elsif !self.email.content_type.to_s.present?
# no content-type header, assume it's text/plain
@body = self.email.body.to_s
end
# TODO: use @possible_charset, but did previously forcing the entire
# email_text to utf8 screw this up already?
# try to remove sig lines
@body.gsub!(/^-- \n.+\z/m, "")
# TODO: try to strip out attribution line, followed by an optional blank
# line, and then lines prefixed with >
@body.strip!
end
end

View File

@ -24,125 +24,47 @@ EX_NOUSER = 67
EX_TEMPFAIL = 75
EX_UNAVAILABLE = 69
recipient = ARGV[0]
user_token = recipient.gsub(/^#{Rails.application.shortname}-/, "").
gsub(/@.*/, "")
sender = ARGV[1]
message = ""
email = nil
while !STDIN.eof?
message += STDIN.gets.to_s
end
message = message.forcibly_convert_to_utf8
parser = EmailParser.new(sender = ARGV[1], recipient = ARGV[0], message)
if message.match(/^X-BeenThere: #{Rails.application.shortname}-/i)
# avoid looping
if parser.been_here?
# avoid looping, quietly
exit
end
sending_user = User.where(:mailing_list_enabled => true,
:mailing_list_token => user_token).first
if !sending_user || !sending_user.is_active?
STDERR.puts "no user with mailing list token #{user_token}"
elsif !parser.sending_user
STDERR.puts "no active user with mailing list token #{parser.user_token}"
# if this looks like a user token but invalid, generate a bounce to be
# helpful. otherwise supress it to avoid talking back to spammers
exit(recipient.match(/^#{Rails.application.shortname}-/) ? EX_NOUSER : 0)
end
exit(parser.user_token ? EX_NOUSER : 0)
# the mail gem stupidly spams STDERR while parsing e-mail, so silence that
# stream to avoid anything getting back to postfix
begin
Utils.silence_stream(STDERR) do
email = Mail.read_from_string(message)
end
if !email
raise
end
rescue
elsif !parser.email
STDERR.puts "error parsing e-mail"
exit EX_UNAVAILABLE
end
# figure out what this reply is to
irt = email[:in_reply_to].to_s.gsub(/[^A-Za-z0-9@\.]/, "")
if m = irt.match(/^comment\.([^\.]+)\.\d+@/)
parent = Comment.where(:short_id => m[1]).first
elsif m = irt.match(/^story\.([^\.]+)\.\d+@/)
parent = Story.where(:short_id => m[1]).first
end
if !parent
elsif !parser.parent
STDERR.puts "no valid comment or story being replied to"
exit EX_NOUSER
end
body = nil
possible_charset = nil
if email.multipart?
# parts[0] - multipart/alternative
# parts[0].parts[0] - text/plain
# parts[0].parts[1] - text/html
if (p = email.parts.first.parts.select{|p|
p.content_type.match(/text\/plain/) }).any?
begin
possible_charset = p.first.content_type_parameters["charset"]
rescue
end
# parts[0] - text/plain
elsif (p = email.parts.select{|p|
p.content_type.match(/text\/plain/) }).any?
body = p.first.body.to_s
begin
possible_charset = p.first.content_type_parameters["charset"]
rescue
end
end
elsif email.content_type.to_s.match(/text\/plain/)
body = email.body.to_s
begin
possible_charset = email.content_type_parameters["charset"]
rescue
end
elsif !email.content_type.to_s.present?
# no content-type header, assume it's text/plain
body = email.body.to_s
end
if !body.present?
# oh well
elsif !parser.body.present?
STDERR.puts "no valid text/plain body found"
exit EX_UNAVAILABLE
end
# try to remove sig lines
body.gsub!(/^-- \n.+\z/, "")
# TODO: try to strip out attribution line, followed by an optional blank line,
# and then lines prefixed with >
body.strip!
c = Comment.new
c.user_id = sending_user.id
c.comment = body
c.user_id = parser.sending_user.id
c.comment = parser.body
c.is_from_email = true
if parent.is_a?(Comment)
c.story_id = parent.story_id
c.parent_comment_id = parent.id
if parser.parent.is_a?(Comment)
c.story_id = parser.parent.story_id
c.parent_comment_id = parser.parent.id
else
c.story_id = parent.id
c.story_id = parser.parent.id
end
if c.save

15
spec/fixtures/inbound_emails/1.eml vendored Normal file
View File

@ -0,0 +1,15 @@
Date: Tue, 16 Jul 2013 10:37:11 -0500
From: joshua stein <jcs@jcs.org>
To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs
Subject: Re: Lobsters by mail [announce]
Message-ID: <20130716103519.78e3be260a@5d7607215bd9704>
References: <story.jg3eet.1372275536@lobste.rs>
<comment.nh0wig.1373986371@lobste.rs>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <comment.##COMMENT_ID##.1373986371@lobste.rs>
X-No-Archive: Yes
It hasn't decreased any measurable amount but since the traffic to
the site is increasing a bit each week, it's hard to tell.

36
spec/fixtures/inbound_emails/2.eml vendored Normal file
View File

@ -0,0 +1,36 @@
Return-Path: <nobody@lobste.rs>
X-Original-To: jcs@jcs.org
Delivered-To: superblock.net-jcs@filter
Received: from vmail.superblock.net (localhost.superblock.net [127.0.0.1])
by vmail.superblock.net (Postfix) with ESMTP id 1657A391FB
for <jcs@jcs.org>; Tue, 16 Jul 2013 10:40:08 -0500 (CDT)
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=lobste.rs; h=from:reply-to
:to:list-id:list-unsubscribe:content-type:message-id:in-reply-to
:references:date:subject; s=20120304; bh=wfuy7M5Juh21GniDc9U1TGw
mqLo=; b=1UFXtiD7QtaaoXfqnVXFMperUlfxsPV8HbPVrkwiB8Yzi5L2AkRdlpD
hunv1x7u1U/VXhHI1eOkp0LkSUc6Zi7legiyPXnWvOye3cTbOmp29KUhLls9gjGD
7inNgqR52uRB6NTxdDtrMZYDjUXJtHZdlLwgvcB6BKj7WPtvWx/A=
DomainKey-Signature: a=rsa-sha1; c=nofws; d=lobste.rs; h=from:reply-to
:to:list-id:list-unsubscribe:content-type:message-id:in-reply-to
:references:date:subject; q=dns; s=20120304; b=WGF2WkJDYzLWJ8jhS
TNERxmtU0qQRqI04yIuIJNHDsr0pwGR01phyvdoZfrJP+RJFDQ7SbTCl+qvXjdcJ
XRn+8zLEd1Mg8Hy2PzZuBVLMxXcJ+WGFTxUbArupByqp9qJnCPGusJmrCEIQCC+N
KqOAQotiWz9B5x5oFaBj97ZbSM=
To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs
List-Id: Lobsters <##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs>
List-Unsubscribe: <https://lobste.rs/settings>
Precedence: list
Content-Type: text/plain; charset="us-ascii"
Message-ID: <comment.jow5ro.1373989038@lobste.rs>
In-Reply-To: <comment.nh0wig.1373986371@lobste.rs>
References: <story.jg3eet.1372275536@lobste.rs>
<comment.nh0wig.1373986371@lobste.rs>
Date: Tue, 16 Jul 2013 10:37:18 -0500
Subject: Re: Lobsters by mail [announce]
X-BeenThere: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs
It hasn't decreased any measurable amount but since the traffic to
the site is increasing a bit each week, it's hard to tell.
--
Vote: https://lobste.rs/s/jg3eet/_/comments/jow5ro

18
spec/fixtures/inbound_emails/3.eml vendored Normal file
View File

@ -0,0 +1,18 @@
Date: Tue, 16 Jul 2013 10:37:11 -0500
From: joshua stein <jcs@jcs.org>
To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs
Subject: Re: Lobsters by mail [announce]
Message-ID: <20130716103519.78e3be260a@5d7607215bd9704>
References: <story.jg3eet.1372275536@lobste.rs>
<comment.nh0wig.1373986371@lobste.rs>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <comment.##COMMENT_ID##.1373986371@lobste.rs>
X-No-Archive: Yes
It hasn't decreased any measurable amount but since the traffic to
the site is increasing a bit each week, it's hard to tell.
--
this is my signature

24
spec/fixtures/inbound_emails/4.eml vendored Normal file
View File

@ -0,0 +1,24 @@
Date: Tue, 16 Jul 2013 10:37:11 -0500
From: joshua stein <jcs@jcs.org>
To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs
Subject: Re: Lobsters by mail [announce]
Message-ID: <20130716103519.78e3be260a@5d7607215bd9704>
References: <story.jg3eet.1372275536@lobste.rs>
<comment.nh0wig.1373986371@lobste.rs>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <comment.##COMMENT_ID##.1373986371@lobste.rs>
X-No-Archive: Yes
On Sun, Feb 2, 2014 at 11:51 PM, blah <blah@lobste.rs> wrote:
> This is some quoted text.
> With an attribution line
It hasn't decreased any measurable amount but since the traffic to
the site is increasing a bit each week, it's hard to tell.
--
this is my signature

View File

@ -0,0 +1,61 @@
require "spec_helper"
describe EmailParser do
before(:each) do
@user = User.make!
@story = Story.make!(:user => @user)
@commentor = User.make!
@comment = Comment.make!(:story => @story, :user => @commentor)
@emailer = User.make!(:mailing_list_enabled => true)
@emails = {}
Dir.glob("#{Rails.root}/spec/fixtures/inbound_emails/*.eml").
each do |f|
@emails[File.basename(f).gsub(/\..*/, "")] = File.read(f).
gsub(/##SHORTNAME##/, Rails.application.shortname).
gsub(/##MAILING_LIST_TOKEN##/, @emailer.mailing_list_token).
gsub(/##COMMENT_ID##/, @comment.short_id)
end
end
it "can parse a valid e-mail" do
parser = EmailParser.new(
"user@example.com",
Rails.application.shortname +
"-#{@emailer.mailing_list_token}@example.org",
@emails["1"])
parser.should_not == nil
parser.email.should_not == nil
parser.user_token.should == @emailer.mailing_list_token
parser.been_here?.should == false
parser.sending_user.id.should == @emailer.id
parser.parent.class.should == Comment
end
it "rejects mailing loops" do
parser = EmailParser.new(
"user@example.com",
Rails.application.shortname +
"-#{@emailer.mailing_list_token}@example.org",
@emails["2"])
parser.email.should_not == nil
parser.been_here?.should == true
end
it "strips signatures" do
parser = EmailParser.new(
"user@example.com",
Rails.application.shortname +
"-#{@emailer.mailing_list_token}@example.org",
@emails["3"])
parser.email.should_not == nil
parser.body.should == "It hasn't decreased any measurable amount but since the traffic to\nthe site is increasing a bit each week, it's hard to tell."
end
end