From b75468a2c930519dc7459fb3ad99f358dcd20e23 Mon Sep 17 00:00:00 2001 From: joshua stein Date: Mon, 3 Feb 2014 19:05:51 -0600 Subject: [PATCH] move inbound e-mail parsing guts into an extra, add tests for it --- extras/email_parser.rb | 110 +++++++++++++++++++++++++++++ script/parse_inbound_mail | 108 ++++------------------------ spec/fixtures/inbound_emails/1.eml | 15 ++++ spec/fixtures/inbound_emails/2.eml | 36 ++++++++++ spec/fixtures/inbound_emails/3.eml | 18 +++++ spec/fixtures/inbound_emails/4.eml | 24 +++++++ spec/models/email_parser_spec.rb | 61 ++++++++++++++++ 7 files changed, 279 insertions(+), 93 deletions(-) create mode 100644 extras/email_parser.rb create mode 100644 spec/fixtures/inbound_emails/1.eml create mode 100644 spec/fixtures/inbound_emails/2.eml create mode 100644 spec/fixtures/inbound_emails/3.eml create mode 100644 spec/fixtures/inbound_emails/4.eml create mode 100644 spec/models/email_parser_spec.rb diff --git a/extras/email_parser.rb b/extras/email_parser.rb new file mode 100644 index 0000000..e22a381 --- /dev/null +++ b/extras/email_parser.rb @@ -0,0 +1,110 @@ +class EmailParser + attr_reader :sender, :recipient, :email_text, :email + + def initialize(sender, recipient, email_text) + @sender = sender + @recipient = recipient + @email_text = email_text.forcibly_convert_to_utf8 + + @email = nil + begin + # the mail gem stupidly spams STDERR while parsing e-mail, so silence + # that stream to avoid anything getting back to postfix + Utils.silence_stream(STDERR) do + @email = Mail.read_from_string(email_text) + end + rescue + end + + @sending_user = nil + @parent = nil + @body = nil + end + + def user_token + @recipient.gsub(/^#{Rails.application.shortname}-/, "").gsub(/@.*/, "") + end + + def been_here? + !!@email_text.match(/^X-BeenThere: #{Rails.application.shortname}-/i) + end + + def sending_user + return @sending_user if @sending_user + + if (user = User.where(:mailing_list_enabled => true, + :mailing_list_token => user_token).first) && user.is_active? + @sending_user = user + return user + end + end + + def parent + return @parent if @parent + + irt = self.email[:in_reply_to].to_s.gsub(/[^A-Za-z0-9@\.]/, "") + + if m = irt.match(/^comment\.([^\.]+)\.\d+@/) + @parent = Comment.where(:short_id => m[1]).first + elsif m = irt.match(/^story\.([^\.]+)\.\d+@/) + @parent = Story.where(:short_id => m[1]).first + end + + @parent + end + + def body + return @body if @body + + @possible_charset = nil + + if self.email.multipart? + # parts[0] - multipart/alternative + # parts[0].parts[0] - text/plain + # parts[0].parts[1] - text/html + if (p = self.email.parts.first.parts.select{|p| + p.content_type.match(/text\/plain/i) }).any? + @body = p.first.body.to_s + + begin + @possible_charset = p.first.content_type_parameters["charset"] + rescue + end + + # parts[0] - text/plain + elsif (p = self.email.parts.select{|p| + p.content_type.match(/text\/plain/i) }).any? + @body = p.first.body.to_s + + begin + @possible_charset = p.first.content_type_parameters["charset"] + rescue + end + end + + # simple one-part + elsif self.email.content_type.to_s.match(/text\/plain/) + @body = self.email.body.to_s + + begin + @possible_charset = self.email.content_type_parameters["charset"] + rescue + end + + elsif !self.email.content_type.to_s.present? + # no content-type header, assume it's text/plain + @body = self.email.body.to_s + end + + # TODO: use @possible_charset, but did previously forcing the entire + # email_text to utf8 screw this up already? + + # try to remove sig lines + @body.gsub!(/^-- \n.+\z/m, "") + + # TODO: try to strip out attribution line, followed by an optional blank + # line, and then lines prefixed with > + + @body.strip! + end +end diff --git a/script/parse_inbound_mail b/script/parse_inbound_mail index 49ccf31..b4e0310 100755 --- a/script/parse_inbound_mail +++ b/script/parse_inbound_mail @@ -24,125 +24,47 @@ EX_NOUSER = 67 EX_TEMPFAIL = 75 EX_UNAVAILABLE = 69 -recipient = ARGV[0] -user_token = recipient.gsub(/^#{Rails.application.shortname}-/, ""). - gsub(/@.*/, "") -sender = ARGV[1] message = "" -email = nil - while !STDIN.eof? message += STDIN.gets.to_s end -message = message.forcibly_convert_to_utf8 +parser = EmailParser.new(sender = ARGV[1], recipient = ARGV[0], message) -if message.match(/^X-BeenThere: #{Rails.application.shortname}-/i) - # avoid looping +if parser.been_here? + # avoid looping, quietly exit -end -sending_user = User.where(:mailing_list_enabled => true, - :mailing_list_token => user_token).first - -if !sending_user || !sending_user.is_active? - STDERR.puts "no user with mailing list token #{user_token}" +elsif !parser.sending_user + STDERR.puts "no active user with mailing list token #{parser.user_token}" # if this looks like a user token but invalid, generate a bounce to be # helpful. otherwise supress it to avoid talking back to spammers - exit(recipient.match(/^#{Rails.application.shortname}-/) ? EX_NOUSER : 0) -end + exit(parser.user_token ? EX_NOUSER : 0) -# the mail gem stupidly spams STDERR while parsing e-mail, so silence that -# stream to avoid anything getting back to postfix -begin - Utils.silence_stream(STDERR) do - email = Mail.read_from_string(message) - end - - if !email - raise - end -rescue +elsif !parser.email STDERR.puts "error parsing e-mail" exit EX_UNAVAILABLE -end -# figure out what this reply is to -irt = email[:in_reply_to].to_s.gsub(/[^A-Za-z0-9@\.]/, "") - -if m = irt.match(/^comment\.([^\.]+)\.\d+@/) - parent = Comment.where(:short_id => m[1]).first -elsif m = irt.match(/^story\.([^\.]+)\.\d+@/) - parent = Story.where(:short_id => m[1]).first -end - -if !parent +elsif !parser.parent STDERR.puts "no valid comment or story being replied to" exit EX_NOUSER -end -body = nil -possible_charset = nil - -if email.multipart? - # parts[0] - multipart/alternative - # parts[0].parts[0] - text/plain - # parts[0].parts[1] - text/html - if (p = email.parts.first.parts.select{|p| - p.content_type.match(/text\/plain/) }).any? - begin - possible_charset = p.first.content_type_parameters["charset"] - rescue - end - - # parts[0] - text/plain - elsif (p = email.parts.select{|p| - p.content_type.match(/text\/plain/) }).any? - body = p.first.body.to_s - - begin - possible_charset = p.first.content_type_parameters["charset"] - rescue - end - end -elsif email.content_type.to_s.match(/text\/plain/) - body = email.body.to_s - - begin - possible_charset = email.content_type_parameters["charset"] - rescue - end - -elsif !email.content_type.to_s.present? - # no content-type header, assume it's text/plain - body = email.body.to_s -end - -if !body.present? - # oh well +elsif !parser.body.present? STDERR.puts "no valid text/plain body found" exit EX_UNAVAILABLE end -# try to remove sig lines -body.gsub!(/^-- \n.+\z/, "") - -# TODO: try to strip out attribution line, followed by an optional blank line, -# and then lines prefixed with > - -body.strip! - c = Comment.new -c.user_id = sending_user.id -c.comment = body +c.user_id = parser.sending_user.id +c.comment = parser.body c.is_from_email = true -if parent.is_a?(Comment) - c.story_id = parent.story_id - c.parent_comment_id = parent.id +if parser.parent.is_a?(Comment) + c.story_id = parser.parent.story_id + c.parent_comment_id = parser.parent.id else - c.story_id = parent.id + c.story_id = parser.parent.id end if c.save diff --git a/spec/fixtures/inbound_emails/1.eml b/spec/fixtures/inbound_emails/1.eml new file mode 100644 index 0000000..a52904f --- /dev/null +++ b/spec/fixtures/inbound_emails/1.eml @@ -0,0 +1,15 @@ +Date: Tue, 16 Jul 2013 10:37:11 -0500 +From: joshua stein +To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs +Subject: Re: Lobsters by mail [announce] +Message-ID: <20130716103519.78e3be260a@5d7607215bd9704> +References: + +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +In-Reply-To: +X-No-Archive: Yes + +It hasn't decreased any measurable amount but since the traffic to +the site is increasing a bit each week, it's hard to tell. diff --git a/spec/fixtures/inbound_emails/2.eml b/spec/fixtures/inbound_emails/2.eml new file mode 100644 index 0000000..48cb5c7 --- /dev/null +++ b/spec/fixtures/inbound_emails/2.eml @@ -0,0 +1,36 @@ +Return-Path: +X-Original-To: jcs@jcs.org +Delivered-To: superblock.net-jcs@filter +Received: from vmail.superblock.net (localhost.superblock.net [127.0.0.1]) + by vmail.superblock.net (Postfix) with ESMTP id 1657A391FB + for ; Tue, 16 Jul 2013 10:40:08 -0500 (CDT) +DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=lobste.rs; h=from:reply-to + :to:list-id:list-unsubscribe:content-type:message-id:in-reply-to + :references:date:subject; s=20120304; bh=wfuy7M5Juh21GniDc9U1TGw + mqLo=; b=1UFXtiD7QtaaoXfqnVXFMperUlfxsPV8HbPVrkwiB8Yzi5L2AkRdlpD + hunv1x7u1U/VXhHI1eOkp0LkSUc6Zi7legiyPXnWvOye3cTbOmp29KUhLls9gjGD + 7inNgqR52uRB6NTxdDtrMZYDjUXJtHZdlLwgvcB6BKj7WPtvWx/A= +DomainKey-Signature: a=rsa-sha1; c=nofws; d=lobste.rs; h=from:reply-to + :to:list-id:list-unsubscribe:content-type:message-id:in-reply-to + :references:date:subject; q=dns; s=20120304; b=WGF2WkJDYzLWJ8jhS + TNERxmtU0qQRqI04yIuIJNHDsr0pwGR01phyvdoZfrJP+RJFDQ7SbTCl+qvXjdcJ + XRn+8zLEd1Mg8Hy2PzZuBVLMxXcJ+WGFTxUbArupByqp9qJnCPGusJmrCEIQCC+N + KqOAQotiWz9B5x5oFaBj97ZbSM= +To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs +List-Id: Lobsters <##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs> +List-Unsubscribe: +Precedence: list +Content-Type: text/plain; charset="us-ascii" +Message-ID: +In-Reply-To: +References: + +Date: Tue, 16 Jul 2013 10:37:18 -0500 +Subject: Re: Lobsters by mail [announce] +X-BeenThere: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs + +It hasn't decreased any measurable amount but since the traffic to +the site is increasing a bit each week, it's hard to tell. + +-- +Vote: https://lobste.rs/s/jg3eet/_/comments/jow5ro diff --git a/spec/fixtures/inbound_emails/3.eml b/spec/fixtures/inbound_emails/3.eml new file mode 100644 index 0000000..ce2d076 --- /dev/null +++ b/spec/fixtures/inbound_emails/3.eml @@ -0,0 +1,18 @@ +Date: Tue, 16 Jul 2013 10:37:11 -0500 +From: joshua stein +To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs +Subject: Re: Lobsters by mail [announce] +Message-ID: <20130716103519.78e3be260a@5d7607215bd9704> +References: + +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +In-Reply-To: +X-No-Archive: Yes + +It hasn't decreased any measurable amount but since the traffic to +the site is increasing a bit each week, it's hard to tell. + +-- +this is my signature diff --git a/spec/fixtures/inbound_emails/4.eml b/spec/fixtures/inbound_emails/4.eml new file mode 100644 index 0000000..1e1f179 --- /dev/null +++ b/spec/fixtures/inbound_emails/4.eml @@ -0,0 +1,24 @@ +Date: Tue, 16 Jul 2013 10:37:11 -0500 +From: joshua stein +To: ##SHORTNAME##-##MAILING_LIST_TOKEN##@lobste.rs +Subject: Re: Lobsters by mail [announce] +Message-ID: <20130716103519.78e3be260a@5d7607215bd9704> +References: + +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +In-Reply-To: +X-No-Archive: Yes + +On Sun, Feb 2, 2014 at 11:51 PM, blah wrote: + +> This is some quoted text. +> With an attribution line + +It hasn't decreased any measurable amount but since the traffic to +the site is increasing a bit each week, it's hard to tell. + +-- +this is my signature + diff --git a/spec/models/email_parser_spec.rb b/spec/models/email_parser_spec.rb new file mode 100644 index 0000000..36735ec --- /dev/null +++ b/spec/models/email_parser_spec.rb @@ -0,0 +1,61 @@ +require "spec_helper" + +describe EmailParser do + before(:each) do + @user = User.make! + @story = Story.make!(:user => @user) + + @commentor = User.make! + @comment = Comment.make!(:story => @story, :user => @commentor) + + @emailer = User.make!(:mailing_list_enabled => true) + + @emails = {} + Dir.glob("#{Rails.root}/spec/fixtures/inbound_emails/*.eml"). + each do |f| + @emails[File.basename(f).gsub(/\..*/, "")] = File.read(f). + gsub(/##SHORTNAME##/, Rails.application.shortname). + gsub(/##MAILING_LIST_TOKEN##/, @emailer.mailing_list_token). + gsub(/##COMMENT_ID##/, @comment.short_id) + end + end + + it "can parse a valid e-mail" do + parser = EmailParser.new( + "user@example.com", + Rails.application.shortname + + "-#{@emailer.mailing_list_token}@example.org", + @emails["1"]) + + parser.should_not == nil + parser.email.should_not == nil + + parser.user_token.should == @emailer.mailing_list_token + parser.been_here?.should == false + parser.sending_user.id.should == @emailer.id + + parser.parent.class.should == Comment + end + + it "rejects mailing loops" do + parser = EmailParser.new( + "user@example.com", + Rails.application.shortname + + "-#{@emailer.mailing_list_token}@example.org", + @emails["2"]) + + parser.email.should_not == nil + parser.been_here?.should == true + end + + it "strips signatures" do + parser = EmailParser.new( + "user@example.com", + Rails.application.shortname + + "-#{@emailer.mailing_list_token}@example.org", + @emails["3"]) + + parser.email.should_not == nil + parser.body.should == "It hasn't decreased any measurable amount but since the traffic to\nthe site is increasing a bit each week, it's hard to tell." + end +end