do better at finding near-similar urls already posted recently

http -> https, trailing slash, etc.
This commit is contained in:
joshua stein 2012-07-11 14:22:26 -05:00
parent 6f3f018598
commit 389b4c61ec
2 changed files with 72 additions and 12 deletions

View file

@ -28,8 +28,7 @@ class Story < ActiveRecord::Base
# URI.parse is not very lenient, so we can't use it
if self.url.match(/\Ahttps?:\/\/([^\.]+\.)+[a-z]+(\/|\z)/)
if self.new_record? && (s = Story.find_by_url(self.url)) &&
(Time.now - s.created_at) < 30.days
if self.new_record? && (s = Story.find_recent_similar_by_url(self.url))
errors.add(:url, "has already been submitted recently")
self.already_posted_story = s
end
@ -46,6 +45,25 @@ class Story < ActiveRecord::Base
end
end
def self.find_recent_similar_by_url(url)
urls = [ url ]
urls.push url.gsub(/^http:\/\//, "https://")
urls.push url.gsub(/^https:\/\//, "http://")
urls.push url.gsub(/^http:\/\//, "https://").gsub(/\/+\z/, "")
urls.push url.gsub(/^https:\/\//, "http://").gsub(/\/+\z/, "")
urls.push url.gsub(/^http:\/\//, "https://") << "/"
urls.push url.gsub(/^https:\/\//, "http://") << "/"
urls.uniq.each do |url|
if s = Story.find(:first, :conditions => [ "created_at >= ? AND url = ?",
(Time.now - 30.days), url ])
return s
end
end
false
end
def assign_short_id
10.times do |try|
if try == 10
@ -194,6 +212,19 @@ class Story < ActiveRecord::Base
end
end
def url=(u)
# strip out stupid google analytics parameters
if u && (m = u.match(/\A([^\?]+)\?(.+)\z/))
params = m[2].split("&")
params.reject!{|p|
p.match(/^utm_(source|medium|campaign|term|content)=/) }
u = m[1] << (params.any?? "?" << params.join("&") : "")
end
self[:url] = u
end
def title=(t)
# change unicode whitespace characters into real spaces
self[:title] = t.strip

View file

@ -34,25 +34,54 @@ describe Story do
end
it "checks for invalid urls" do
expect { Story.make!(:title => "test", :url => "http://gooses.com/")
}.to_not raise_error
expect { Story.make!(:url => "http://gooses.com/") }.to_not raise_error
expect { Story.make!(:title => "test", url => "ftp://gooses/")
}.to raise_error
expect { Story.make!(:url => "ftp://gooses/") }.to raise_error
end
it "checks for a previously posted story with same url" do
Story.count.should == 0
it "removes crap from urls" do
Story.make!(:url => "http://www.example.com/").
url.should == "http://www.example.com/"
Story.delete_all
Story.make!(:title => "flim flam", :url => "http://example.com/")
Story.make!(:url => "http://www.example.com/?utm_campaign=Spam").
url.should == "http://www.example.com/"
Story.delete_all
Story.make!(:url => "http://www.example.com/?utm_campaign=Spam&hello=hi").
url.should == "http://www.example.com/?hello=hi"
Story.delete_all
end
it "finds similar urls" do
s = Story.make!(:url => "https://example.com/something")
Story.count.should == 1
expect { Story.make!(:title => "flim flam 2",
:url => "http://example.com/") }.to raise_error
new_s = Story.make(:url => "http://example.com/something")
new_s.save.should == false
new_s.already_posted_story.should == s
new_s = Story.make(:url => "http://example.com/something/")
new_s.save.should == false
new_s.already_posted_story.should == s
new_s = Story.make(:url => "http://example.com/something/")
new_s.save.should == false
new_s.already_posted_story.should == s
Story.count.should == 1
end
it "ignores similar urls from long ago" do
new_s = Story.make(:created_at => 31.days.ago,
:url => "http://example.com/something")
new_s.save.should == true
Story.count.should == 1
new_s = Story.make(:url => "http://example.com/something")
new_s.save.should == true
end
it "parses domain properly" do
s = Story.make!(:url => "http://example.com")
s.domain.should == "example.com"