when auto-detecting story titles, try to find a canonical url and use it

This should remedy duplicate stories being submitted under each of a blog's
stupid blogspot.* domains instead of the canonical URL given in the page's
<link rel="canonical"> tag.
This commit is contained in:
joshua stein 2015-03-10 17:41:40 -05:00
parent 491a3d57e8
commit 6695480fdb
4 changed files with 100 additions and 58 deletions

View file

@ -238,12 +238,16 @@ var _Lobsters = Class.extend({
button.prop("disabled", true);
button.val("Fetching...");
$.post("/stories/fetch_url_title", {
$.post("/stories/fetch_url_attributes", {
fetch_url: url_field.val(),
})
.success(function(data) {
if (data && data.title)
title_field.val(data.title.substr(0, title_field.maxLength));
if (data) {
if (data.title)
title_field.val(data.title.substr(0, title_field.maxLength));
if (data.url)
url_field.val(data.url);
}
button.val(old_value);
button.prop("disabled", false);

View file

@ -58,15 +58,22 @@ class StoriesController < ApplicationController
end
end
def fetch_url_title
def fetch_url_attributes
s = Story.new
s.fetching_ip = request.remote_ip
s.url = params[:fetch_url]
if (title = s.fetched_title(request.remote_ip)).present?
return render :json => { :title => title }
else
return render :json => "error"
oattrs = { :url => params[:fetch_url], :title => nil }
if (title = s.fetched_title).present?
oattrs[:title] = title
end
if (cu = s.fetched_canonical_url).present?
oattrs[:url] = cu
end
return render :json => oattrs
end
def new
@ -74,10 +81,17 @@ class StoriesController < ApplicationController
@cur_url = "/stories/new"
@story = Story.new
@story.fetching_ip = request.remote_ip
if params[:url].present?
@story.url = params[:url]
if (cu = @story.fetched_canonical_url).present? && @story.url != cu
flash.now[:notice] = "Note: URL has been changed to fetched " <<
"canonicalized version"
@story.url = cu
end
if s = Story.find_similar_by_url(@story.url)
if s.is_recent?
# user won't be able to submit this story as new, so just redirect
@ -91,7 +105,7 @@ class StoriesController < ApplicationController
end
# ignore what the user brought unless we need it as a fallback
@story.title = @story.fetched_title(request.remote_ip)
@story.title = @story.fetched_title
if !@story.title.present? && params[:title].present?
@story.title = params[:title]
end

View file

@ -30,9 +30,10 @@ class Story < ActiveRecord::Base
# days a story is considered recent, for resubmitting
RECENT_DAYS = 30
attr_accessor :vote, :already_posted_story, :fetched_content, :previewing,
:seen_previous, :is_hidden_by_cur_user
attr_accessor :vote, :already_posted_story, :previewing, :seen_previous,
:is_hidden_by_cur_user
attr_accessor :editor, :moderation_reason, :merge_story_short_id
attr_accessor :fetching_ip
before_validation :assign_short_id_and_upvote,
:on => :create
@ -237,52 +238,6 @@ class Story < ActiveRecord::Base
end
end
# Fetches this story's URL through Sponge and caches the result in
# @fetched_content, so at most one successful fetch happens per instance.
#
# for_remote_ip is interpolated into the User-agent header sent with the
# request, next to the application's domain.
#
# Returns whatever Sponge#fetch returned, or the previously cached value; nil
# when nothing has been fetched successfully.
def fetched_content(for_remote_ip = nil)
  unless @fetched_content
    begin
      sponge = Sponge.new
      sponge.timeout = 3
      target = self.url
      headers = {
        "User-agent" => "#{Rails.application.domain} for #{for_remote_ip}",
      }
      # NOTE(review): the trailing 3 is presumably a redirect limit -- confirm
      # against Sponge#fetch
      @fetched_content = sponge.fetch(target, :get, nil, nil, headers, 3)
    rescue
      # best effort: any error while fetching leaves the cache untouched
    end
  end

  @fetched_content
end
# Works out a title for the story from the HTML fetched from its URL.
#
# for_remote_ip is passed straight through to fetched_content.
#
# Sources are tried in order and the first non-empty one wins:
#   1. <meta property="og:title"> (it probably won't have the site name)
#   2. <meta name="title">
#   3. plain old <title>
#
# Returns "" when none of them yields anything.
def fetched_title(for_remote_ip = nil)
  doc = Nokogiri::HTML(fetched_content(for_remote_ip).to_s)
  return "" unless doc

  found = ""
  ["meta[property='og:title']", "meta[name='title']"].each do |selector|
    break unless found.to_s == ""

    begin
      found = doc.at_css(selector).attributes["content"].text
    rescue
      # tag or its content attribute is missing; try the next source
    end
  end

  if found.to_s == ""
    found = doc.at_css("title").try(:text).to_s
  end

  found
end
# Renders the story's description through Markdowner.to_html with the
# :allow_images option switched on, and returns the result.
def generated_markeddown_description
  render_opts = { :allow_images => true }
  Markdowner.to_html(self.description, render_opts)
end
@ -576,4 +531,73 @@ class Story < ActiveRecord::Base
end
}.join(", ")
end
# Fetches this story's URL through Sponge and caches the result in
# @fetched_content, so at most one successful fetch happens per instance.
#
# The User-agent header sent with the request carries the application's
# domain and this story's fetching_ip.
#
# Returns whatever Sponge#fetch returned, or the previously cached value; nil
# when nothing has been fetched successfully.
def fetched_content
  unless @fetched_content
    begin
      sponge = Sponge.new
      sponge.timeout = 3
      target = self.url
      headers = {
        "User-agent" => "#{Rails.application.domain} for #{self.fetching_ip}",
      }
      # NOTE(review): the trailing 3 is presumably a redirect limit -- confirm
      # against Sponge#fetch
      @fetched_content = sponge.fetch(target, :get, nil, nil, headers, 3)
    rescue
      # best effort: any error while fetching leaves the cache untouched
    end
  end

  @fetched_content
end
# Parses fetched_content (coerced to a string, so a failed fetch parses as
# "") with Nokogiri::HTML and caches the resulting document in
# @parsed_content for later calls.
def parsed_content
  @parsed_content ||= Nokogiri::HTML(self.fetched_content.to_s)
end
# Works out a title for the story from its parsed_content.
#
# Sources are tried in order and the first non-empty one wins:
#   1. <meta property="og:title"> (it probably won't have the site name)
#   2. <meta name="title">
#   3. plain old <title>
#
# Returns "" when there is no parsed document or none of the sources yields
# anything.
def fetched_title
  doc = self.parsed_content
  return "" unless doc

  found = ""
  ["meta[property='og:title']", "meta[name='title']"].each do |selector|
    break unless found.to_s == ""

    begin
      found = doc.at_css(selector).attributes["content"].text
    rescue
      # tag or its content attribute is missing; try the next source
    end
  end

  if found.to_s == ""
    found = doc.at_css("title").try(:text).to_s
  end

  found
end
# Returns the canonical URL advertised by the fetched page, falling back to
# the story's own url.
#
# The href of <link rel="canonical"> in parsed_content is used only when it
# is non-empty and parses as a URI that has both a scheme and a host, so
# relative or malformed hrefs are ignored.
#
# A canonical URL that passes those checks is cached in
# @fetched_canonical_url and returned directly on later calls.  The fallback
# to self.url is deliberately not cached, so it always reflects the current
# url.
def fetched_canonical_url
  return @fetched_canonical_url if @fetched_canonical_url

  if doc = self.parsed_content
    begin
      link = doc.at_css("link[rel='canonical']")
      href = link && link.attributes["href"]
      cu = href && href.text

      if cu.present? && (ucu = URI.parse(cu)) && ucu.scheme.present? &&
      ucu.host.present?
        @fetched_canonical_url = cu
        return cu
      end
    rescue
      # best effort: an href that URI.parse rejects just means there is no
      # usable canonical url
    end
  end

  return self.url
end
end

View file

@ -58,7 +58,7 @@ Lobsters::Application.routes.draw do
post "hide"
post "unhide"
end
post "/stories/fetch_url_title", :format => "json"
post "/stories/fetch_url_attributes", :format => "json"
post "/stories/preview" => "stories#preview"
resources :comments do