do better at finding near-similar urls already posted recently
http -> https, trailing slash, etc.
This commit is contained in:
parent
6f3f018598
commit
389b4c61ec
|
@ -28,8 +28,7 @@ class Story < ActiveRecord::Base
|
||||||
# URI.parse is not very lenient, so we can't use it
|
# URI.parse is not very lenient, so we can't use it
|
||||||
|
|
||||||
if self.url.match(/\Ahttps?:\/\/([^\.]+\.)+[a-z]+(\/|\z)/)
|
if self.url.match(/\Ahttps?:\/\/([^\.]+\.)+[a-z]+(\/|\z)/)
|
||||||
if self.new_record? && (s = Story.find_by_url(self.url)) &&
|
if self.new_record? && (s = Story.find_recent_similar_by_url(self.url))
|
||||||
(Time.now - s.created_at) < 30.days
|
|
||||||
errors.add(:url, "has already been submitted recently")
|
errors.add(:url, "has already been submitted recently")
|
||||||
self.already_posted_story = s
|
self.already_posted_story = s
|
||||||
end
|
end
|
||||||
|
@ -46,6 +45,25 @@ class Story < ActiveRecord::Base
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def self.find_recent_similar_by_url(url)
|
||||||
|
urls = [ url ]
|
||||||
|
urls.push url.gsub(/^http:\/\//, "https://")
|
||||||
|
urls.push url.gsub(/^https:\/\//, "http://")
|
||||||
|
urls.push url.gsub(/^http:\/\//, "https://").gsub(/\/+\z/, "")
|
||||||
|
urls.push url.gsub(/^https:\/\//, "http://").gsub(/\/+\z/, "")
|
||||||
|
urls.push url.gsub(/^http:\/\//, "https://") << "/"
|
||||||
|
urls.push url.gsub(/^https:\/\//, "http://") << "/"
|
||||||
|
|
||||||
|
urls.uniq.each do |url|
|
||||||
|
if s = Story.find(:first, :conditions => [ "created_at >= ? AND url = ?",
|
||||||
|
(Time.now - 30.days), url ])
|
||||||
|
return s
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
false
|
||||||
|
end
|
||||||
|
|
||||||
def assign_short_id
|
def assign_short_id
|
||||||
10.times do |try|
|
10.times do |try|
|
||||||
if try == 10
|
if try == 10
|
||||||
|
@ -194,6 +212,19 @@ class Story < ActiveRecord::Base
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def url=(u)
|
||||||
|
# strip out stupid google analytics parameters
|
||||||
|
if u && (m = u.match(/\A([^\?]+)\?(.+)\z/))
|
||||||
|
params = m[2].split("&")
|
||||||
|
params.reject!{|p|
|
||||||
|
p.match(/^utm_(source|medium|campaign|term|content)=/) }
|
||||||
|
|
||||||
|
u = m[1] << (params.any?? "?" << params.join("&") : "")
|
||||||
|
end
|
||||||
|
|
||||||
|
self[:url] = u
|
||||||
|
end
|
||||||
|
|
||||||
def title=(t)
|
def title=(t)
|
||||||
# change unicode whitespace characters into real spaces
|
# change unicode whitespace characters into real spaces
|
||||||
self[:title] = t.strip
|
self[:title] = t.strip
|
||||||
|
|
|
@ -34,25 +34,54 @@ describe Story do
|
||||||
end
|
end
|
||||||
|
|
||||||
it "checks for invalid urls" do
|
it "checks for invalid urls" do
|
||||||
expect { Story.make!(:title => "test", :url => "http://gooses.com/")
|
expect { Story.make!(:url => "http://gooses.com/") }.to_not raise_error
|
||||||
}.to_not raise_error
|
|
||||||
|
|
||||||
expect { Story.make!(:title => "test", url => "ftp://gooses/")
|
expect { Story.make!(:url => "ftp://gooses/") }.to raise_error
|
||||||
}.to raise_error
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it "checks for a previously posted story with same url" do
|
it "removes crap from urls" do
|
||||||
Story.count.should == 0
|
Story.make!(:url => "http://www.example.com/").
|
||||||
|
url.should == "http://www.example.com/"
|
||||||
|
Story.delete_all
|
||||||
|
|
||||||
Story.make!(:title => "flim flam", :url => "http://example.com/")
|
Story.make!(:url => "http://www.example.com/?utm_campaign=Spam").
|
||||||
|
url.should == "http://www.example.com/"
|
||||||
|
Story.delete_all
|
||||||
|
|
||||||
|
Story.make!(:url => "http://www.example.com/?utm_campaign=Spam&hello=hi").
|
||||||
|
url.should == "http://www.example.com/?hello=hi"
|
||||||
|
Story.delete_all
|
||||||
|
end
|
||||||
|
|
||||||
|
it "finds similar urls" do
|
||||||
|
s = Story.make!(:url => "https://example.com/something")
|
||||||
Story.count.should == 1
|
Story.count.should == 1
|
||||||
|
|
||||||
expect { Story.make!(:title => "flim flam 2",
|
new_s = Story.make(:url => "http://example.com/something")
|
||||||
:url => "http://example.com/") }.to raise_error
|
new_s.save.should == false
|
||||||
|
new_s.already_posted_story.should == s
|
||||||
|
|
||||||
|
new_s = Story.make(:url => "http://example.com/something/")
|
||||||
|
new_s.save.should == false
|
||||||
|
new_s.already_posted_story.should == s
|
||||||
|
|
||||||
|
new_s = Story.make(:url => "http://example.com/something/")
|
||||||
|
new_s.save.should == false
|
||||||
|
new_s.already_posted_story.should == s
|
||||||
|
|
||||||
Story.count.should == 1
|
Story.count.should == 1
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "ignores similar urls from long ago" do
|
||||||
|
new_s = Story.make(:created_at => 31.days.ago,
|
||||||
|
:url => "http://example.com/something")
|
||||||
|
new_s.save.should == true
|
||||||
|
Story.count.should == 1
|
||||||
|
|
||||||
|
new_s = Story.make(:url => "http://example.com/something")
|
||||||
|
new_s.save.should == true
|
||||||
|
end
|
||||||
|
|
||||||
it "parses domain properly" do
|
it "parses domain properly" do
|
||||||
s = Story.make!(:url => "http://example.com")
|
s = Story.make!(:url => "http://example.com")
|
||||||
s.domain.should == "example.com"
|
s.domain.should == "example.com"
|
||||||
|
|
Loading…
Reference in a new issue