add similar-url check for www\d*\., rewrite checker to be simpler

joshua stein 2012-07-17 18:17:46 -05:00
parent 70db7a5879
commit c945f29040
2 changed files with 45 additions and 49 deletions
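The new similar-URL check named in the commit message hinges on the www\d*\. pattern. As a quick illustration of what it treats as an equivalent host prefix (not part of the diff; the sample URLs are made up):

pat = /^(https?:\/\/)www\d*\./i

pat =~ "http://www.example.com/story"    # => 0 (match, so a plain "www." prefix is equivalent)
pat =~ "https://www3.example.com/story"  # => 0 (numbered prefixes like "www3." match too)
pat =~ "http://wwwx.example.com/story"   # => nil (a dot must follow www\d*)
pat =~ "http://example.com/story"        # => nil (no prefix present)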

@@ -67,18 +67,38 @@ class Story < ActiveRecord::Base
   def self.find_recent_similar_by_url(url)
     urls = [ url ]
-    urls.push url.gsub(/^http:\/\//, "https://")
-    urls.push url.gsub(/^https:\/\//, "http://")
-    urls.push url.gsub(/^http:\/\//, "https://").gsub(/\/+\z/, "")
-    urls.push url.gsub(/^https:\/\//, "http://").gsub(/\/+\z/, "")
-    urls.push url.gsub(/^http:\/\//, "https://") << "/"
-    urls.push url.gsub(/^https:\/\//, "http://") << "/"
+    urls2 = [ url ]
 
-    urls.uniq.each do |url|
-      if s = Story.find(:first, :conditions => [ "created_at >= ? AND url = ?",
-      (Time.now - 30.days), url ])
-        return s
-      end
+    # https
+    urls.each do |u|
+      urls2.push u.gsub(/^http:\/\//i, "https://")
+      urls2.push u.gsub(/^https:\/\//i, "http://")
     end
+    urls = urls2.clone
+
+    # trailing slash
+    urls.each do |u|
+      urls2.push u.gsub(/\/+\z/, "")
+      urls2.push (u << "/")
+    end
+    urls = urls2.clone
+
+    # www prefix
+    urls.each do |u|
+      urls2.push u.gsub(/^(https?:\/\/)www\d*\./i) {|_| $1 }
+      urls2.push u.gsub(/^(https?:\/\/)/i) {|_| "#{$1}www." }
+    end
+    urls = urls2.clone
+
+    conds = [ "created_at >= ? AND (", (Time.now - 30.days) ]
+    urls.uniq.each_with_index do |url,x|
+      conds[0] << (x == 0 ? "" : " OR ") << "url = ?"
+      conds.push url
+    end
+    conds[0] << ")"
+
+    if s = Story.find(:first, :conditions => conds)
+      return s
+    end
 
     false
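To make the rewritten checker concrete, here is a rough standalone sketch of the variant list it builds for one made-up URL (plain Ruby, no Rails required; the trailing-slash step avoids the diff's in-place u << "/" so the sample strings are not mutated):

url = "https://www3.example.com/story/"

urls = [ url ]
urls2 = [ url ]

# pass 1: http <-> https
urls.each do |u|
  urls2.push u.gsub(/^http:\/\//i, "https://")
  urls2.push u.gsub(/^https:\/\//i, "http://")
end
urls = urls2.clone

# pass 2: with and without a trailing slash
urls.each do |u|
  urls2.push u.sub(/\/+\z/, "")
  urls2.push u.sub(/\/*\z/, "") + "/"
end
urls = urls2.clone

# pass 3: with and without a www/www2/www3/... prefix
urls.each do |u|
  urls2.push u.gsub(/^(https?:\/\/)www\d*\./i) {|_| $1 }
  urls2.push u.gsub(/^(https?:\/\/)/i) {|_| "#{$1}www." }
end

puts urls2.uniq
# => the http/https, trailing-slash, and www-prefix combinations of the input
#    (e.g. "http://example.com/story", "https://example.com/story/", ...)

Each entry in that list becomes one "url = ?" clause ORed into a single 30-day query, where the old version issued a separate Story.find per variant.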

@@ -34,59 +34,35 @@ describe Story do
   end
 
   it "checks for invalid urls" do
-    expect { Story.make!(:url => "http://gooses.com/") }.to_not raise_error
+    expect { Story.make!(:title => "test", :url => "http://gooses.com/")
+      }.to_not raise_error
 
-    expect { Story.make!(:url => "ftp://gooses/") }.to raise_error
+    expect { Story.make!(:title => "test", url => "ftp://gooses/")
+      }.to raise_error
   end
 
-  it "removes crap from urls" do
-    Story.make!(:url => "http://www.example.com/").
-      url.should == "http://www.example.com/"
-    Story.delete_all
-
-    Story.make!(:url => "http://www.example.com/?utm_campaign=Spam").
-      url.should == "http://www.example.com/"
-    Story.delete_all
-
-    Story.make!(:url => "http://www.example.com/?utm_campaign=Spam&hello=hi").
-      url.should == "http://www.example.com/?hello=hi"
-    Story.delete_all
-  end
-
-  it "finds similar urls" do
-    s = Story.make!(:url => "https://example.com/something")
-
-    new_s = Story.make(:url => "http://example.com/something")
-    new_s.save.should == false
-    new_s.already_posted_story.should == s
-
-    new_s = Story.make(:url => "http://example.com/something/")
-    new_s.save.should == false
-    new_s.already_posted_story.should == s
-
-    new_s = Story.make(:url => "http://example.com/something/")
-    new_s.save.should == false
-    new_s.already_posted_story.should == s
-
-    Story.count.should == 1
-  end
-
-  it "ignores similar urls from long ago" do
-    new_s = Story.make(:created_at => 31.days.ago,
-      :url => "http://example.com/something")
-    new_s.save.should == true
-    Story.count.should == 1
-
-    new_s = Story.make(:url => "http://example.com/something")
-    new_s.save.should == true
-    Story.count.should == 1
-  end
+  it "checks for a previously posted story with same url" do
+    Story.count.should == 0
+
+    Story.make!(:title => "flim flam", :url => "http://example.com/")
+    Story.count.should == 1
+
+    expect { Story.make!(:title => "flim flam 2",
+      :url => "http://example.com/") }.to raise_error
+    Story.count.should == 1
+
+    expect { Story.make!(:title => "flim flam 2",
+      :url => "http://www.example.com/") }.to raise_error
+    Story.count.should == 1
+  end
 
   it "parses domain properly" do
     s = Story.make!(:url => "http://example.com")
     s.domain.should == "example.com"
 
-    s = Story.make!(:url => "http://www3.example.com")
+    s = Story.make!(:url => "http://www3.example.com/goose")
     s.domain.should == "example.com"
 
     s = Story.make!(:url => "http://flub.example.com")