From 389b4c61ec0dfbb722ba8ae66c67d21a576de99e Mon Sep 17 00:00:00 2001 From: joshua stein Date: Wed, 11 Jul 2012 14:22:26 -0500 Subject: [PATCH] do better at finding near-similar urls already posted recently http -> https, trailing slash, etc. --- app/models/story.rb | 35 ++++++++++++++++++++++++++-- spec/models/story_spec.rb | 49 +++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/app/models/story.rb b/app/models/story.rb index dd1fa2b..9d9c73f 100644 --- a/app/models/story.rb +++ b/app/models/story.rb @@ -28,8 +28,7 @@ class Story < ActiveRecord::Base # URI.parse is not very lenient, so we can't use it if self.url.match(/\Ahttps?:\/\/([^\.]+\.)+[a-z]+(\/|\z)/) - if self.new_record? && (s = Story.find_by_url(self.url)) && - (Time.now - s.created_at) < 30.days + if self.new_record? && (s = Story.find_recent_similar_by_url(self.url)) errors.add(:url, "has already been submitted recently") self.already_posted_story = s end @@ -46,6 +45,25 @@ class Story < ActiveRecord::Base end end + def self.find_recent_similar_by_url(url) + urls = [ url ] + urls.push url.gsub(/^http:\/\//, "https://") + urls.push url.gsub(/^https:\/\//, "http://") + urls.push url.gsub(/^http:\/\//, "https://").gsub(/\/+\z/, "") + urls.push url.gsub(/^https:\/\//, "http://").gsub(/\/+\z/, "") + urls.push url.gsub(/^http:\/\//, "https://") << "/" + urls.push url.gsub(/^https:\/\//, "http://") << "/" + + urls.uniq.each do |url| + if s = Story.find(:first, :conditions => [ "created_at >= ? AND url = ?", + (Time.now - 30.days), url ]) + return s + end + end + + false + end + def assign_short_id 10.times do |try| if try == 10 @@ -194,6 +212,19 @@ class Story < ActiveRecord::Base end end + def url=(u) + # strip out stupid google analytics parameters + if u && (m = u.match(/\A([^\?]+)\?(.+)\z/)) + params = m[2].split("&") + params.reject!{|p| + p.match(/^utm_(source|medium|campaign|term|content)=/) } + + u = m[1] << (params.any?? "?" << params.join("&") : "") + end + + self[:url] = u + end + def title=(t) # change unicode whitespace characters into real spaces self[:title] = t.strip diff --git a/spec/models/story_spec.rb b/spec/models/story_spec.rb index 9b318da..57eae7f 100644 --- a/spec/models/story_spec.rb +++ b/spec/models/story_spec.rb @@ -34,25 +34,54 @@ describe Story do end it "checks for invalid urls" do - expect { Story.make!(:title => "test", :url => "http://gooses.com/") - }.to_not raise_error + expect { Story.make!(:url => "http://gooses.com/") }.to_not raise_error - expect { Story.make!(:title => "test", url => "ftp://gooses/") - }.to raise_error + expect { Story.make!(:url => "ftp://gooses/") }.to raise_error end - it "checks for a previously posted story with same url" do - Story.count.should == 0 + it "removes crap from urls" do + Story.make!(:url => "http://www.example.com/"). + url.should == "http://www.example.com/" + Story.delete_all - Story.make!(:title => "flim flam", :url => "http://example.com/") + Story.make!(:url => "http://www.example.com/?utm_campaign=Spam"). + url.should == "http://www.example.com/" + Story.delete_all + + Story.make!(:url => "http://www.example.com/?utm_campaign=Spam&hello=hi"). + url.should == "http://www.example.com/?hello=hi" + Story.delete_all + end + + it "finds similar urls" do + s = Story.make!(:url => "https://example.com/something") Story.count.should == 1 - expect { Story.make!(:title => "flim flam 2", - :url => "http://example.com/") }.to raise_error - + new_s = Story.make(:url => "http://example.com/something") + new_s.save.should == false + new_s.already_posted_story.should == s + + new_s = Story.make(:url => "http://example.com/something/") + new_s.save.should == false + new_s.already_posted_story.should == s + + new_s = Story.make(:url => "http://example.com/something/") + new_s.save.should == false + new_s.already_posted_story.should == s + Story.count.should == 1 end + it "ignores similar urls from long ago" do + new_s = Story.make(:created_at => 31.days.ago, + :url => "http://example.com/something") + new_s.save.should == true + Story.count.should == 1 + + new_s = Story.make(:url => "http://example.com/something") + new_s.save.should == true + end + it "parses domain properly" do s = Story.make!(:url => "http://example.com") s.domain.should == "example.com"