From 9d3c6f1849120e732a9230959cb302575765ea8c Mon Sep 17 00:00:00 2001 From: ThibG Date: Thu, 28 Feb 2019 15:22:21 +0100 Subject: [PATCH] Improved remote thread fetching (#10106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fetch up to 5 replies when discovering a new remote status This is used for resolving threads downwards. The originating server must add a “replies” attributes with such replies for it to be useful. * Add some tests for ActivityPub::FetchRepliesWorker * Add specs for ActivityPub::FetchRepliesService * Serialize up to 5 public self-replies for ActivityPub notes * Add specs for ActivityPub::NoteSerializer * Move exponential backoff logic to a worker concern * Fetch first page of paginated collections when fetching thread replies * Add specs for paginated collections in replies * Move Note replies serialization to a first CollectionPage The collection isn't actually paginable yet as it has no id nor a `next` field. This may come in another PR. * Use pluck(:uri) instead of map(&:uri) to improve performances * Fix fetching replies when they are in a CollectionPage --- app/lib/activitypub/activity/create.rb | 10 ++ .../concerns/status_threading_concern.rb | 4 + .../activitypub/collection_presenter.rb | 2 +- .../activitypub/collection_serializer.rb | 5 +- .../activitypub/note_serializer.rb | 13 ++ .../activitypub/fetch_replies_service.rb | 60 +++++++++ .../activitypub/fetch_replies_worker.rb | 12 ++ app/workers/concerns/exponential_backoff.rb | 11 ++ app/workers/fetch_reply_worker.rb | 12 ++ app/workers/thread_resolve_worker.rb | 5 +- spec/serializers/activitypub/note_spec.rb | 44 +++++++ .../activitypub/fetch_replies_service_spec.rb | 122 ++++++++++++++++++ .../activitypub/fetch_replies_worker_spec.rb | 40 ++++++ 13 files changed, 333 insertions(+), 7 deletions(-) create mode 100644 app/services/activitypub/fetch_replies_service.rb create mode 100644 app/workers/activitypub/fetch_replies_worker.rb create mode 100644 app/workers/concerns/exponential_backoff.rb create mode 100644 app/workers/fetch_reply_worker.rb create mode 100644 spec/serializers/activitypub/note_spec.rb create mode 100644 spec/services/activitypub/fetch_replies_service_spec.rb create mode 100644 spec/workers/activitypub/fetch_replies_worker_spec.rb diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index 6d58aba70..0980f94ba 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -40,6 +40,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity end resolve_thread(@status) + fetch_replies(@status) distribute(@status) forward_for_reply if @status.public_visibility? || @status.unlisted_visibility? end @@ -213,6 +214,15 @@ class ActivityPub::Activity::Create < ActivityPub::Activity ThreadResolveWorker.perform_async(status.id, in_reply_to_uri) end + def fetch_replies(status) + collection = @object['replies'] + return if collection.nil? + replies = ActivityPub::FetchRepliesService.new.call(status, collection, false) + return if replies.present? + uri = value_or_id(collection) + ActivityPub::FetchRepliesWorker.perform_async(status.id, uri) unless uri.nil? + end + def conversation_from_uri(uri) return nil if uri.nil? return Conversation.find_by(id: OStatus::TagManager.instance.unique_tag_to_local_id(uri, 'Conversation')) if OStatus::TagManager.instance.local_id?(uri) diff --git a/app/models/concerns/status_threading_concern.rb b/app/models/concerns/status_threading_concern.rb index b9c800c2a..15eb695cd 100644 --- a/app/models/concerns/status_threading_concern.rb +++ b/app/models/concerns/status_threading_concern.rb @@ -11,6 +11,10 @@ module StatusThreadingConcern find_statuses_from_tree_path(descendant_ids(limit, max_child_id, since_child_id, depth), account, promote: true) end + def self_replies(limit) + account.statuses.where(in_reply_to_id: id, visibility: [:public, :unlisted]).reorder(id: :asc).limit(limit) + end + private def ancestor_ids(limit) diff --git a/app/presenters/activitypub/collection_presenter.rb b/app/presenters/activitypub/collection_presenter.rb index ec84ab1a3..28331f0c4 100644 --- a/app/presenters/activitypub/collection_presenter.rb +++ b/app/presenters/activitypub/collection_presenter.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true class ActivityPub::CollectionPresenter < ActiveModelSerializers::Model - attributes :id, :type, :size, :items, :part_of, :first, :last, :next, :prev + attributes :id, :type, :size, :items, :page, :part_of, :first, :last, :next, :prev end diff --git a/app/serializers/activitypub/collection_serializer.rb b/app/serializers/activitypub/collection_serializer.rb index e8960131b..b03609957 100644 --- a/app/serializers/activitypub/collection_serializer.rb +++ b/app/serializers/activitypub/collection_serializer.rb @@ -7,7 +7,8 @@ class ActivityPub::CollectionSerializer < ActiveModel::Serializer super end - attributes :id, :type + attribute :id, if: -> { object.id.present? } + attribute :type attribute :total_items, if: -> { object.size.present? } attribute :next, if: -> { object.next.present? } attribute :prev, if: -> { object.prev.present? } @@ -37,6 +38,6 @@ class ActivityPub::CollectionSerializer < ActiveModel::Serializer end def page? - object.part_of.present? + object.part_of.present? || object.page.present? end end diff --git a/app/serializers/activitypub/note_serializer.rb b/app/serializers/activitypub/note_serializer.rb index c9d23e25f..6b0978ad3 100644 --- a/app/serializers/activitypub/note_serializer.rb +++ b/app/serializers/activitypub/note_serializer.rb @@ -13,6 +13,8 @@ class ActivityPub::NoteSerializer < ActiveModel::Serializer has_many :media_attachments, key: :attachment has_many :virtual_tags, key: :tag + has_one :replies, serializer: ActivityPub::CollectionSerializer + def id ActivityPub::TagManager.instance.uri_for(object) end @@ -33,6 +35,17 @@ class ActivityPub::NoteSerializer < ActiveModel::Serializer { object.language => Formatter.instance.format(object) } end + def replies + ActivityPub::CollectionPresenter.new( + type: :unordered, + first: ActivityPub::CollectionPresenter.new( + type: :unordered, + page: true, + items: object.self_replies(5).pluck(:uri) + ) + ) + end + def language? object.language.present? end diff --git a/app/services/activitypub/fetch_replies_service.rb b/app/services/activitypub/fetch_replies_service.rb new file mode 100644 index 000000000..95c486a43 --- /dev/null +++ b/app/services/activitypub/fetch_replies_service.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +class ActivityPub::FetchRepliesService < BaseService + include JsonLdHelper + + def call(parent_status, collection_or_uri, allow_synchronous_requests = true) + @account = parent_status.account + @allow_synchronous_requests = allow_synchronous_requests + + @items = collection_items(collection_or_uri) + return if @items.nil? + + FetchReplyWorker.push_bulk(filtered_replies) + + @items + end + + private + + def collection_items(collection_or_uri) + collection = fetch_collection(collection_or_uri) + return unless collection.is_a?(Hash) + + collection = fetch_collection(collection['first']) if collection['first'].present? + return unless collection.is_a?(Hash) + + case collection['type'] + when 'Collection', 'CollectionPage' + collection['items'] + when 'OrderedCollection', 'OrderedCollectionPage' + collection['orderedItems'] + end + end + + def fetch_collection(collection_or_uri) + return collection_or_uri if collection_or_uri.is_a?(Hash) + return unless @allow_synchronous_requests + return if invalid_origin?(collection_or_uri) + collection = fetch_resource_without_id_validation(collection_or_uri) + raise Mastodon::UnexpectedResponseError if collection.nil? + collection + end + + def filtered_replies + # Only fetch replies to the same server as the original status to avoid + # amplification attacks. + + # Also limit to 5 fetched replies to limit potential for DoS. + @items.map { |item| value_or_id(item) }.reject { |uri| invalid_origin?(uri) }.take(5) + end + + def invalid_origin?(url) + return true if unsupported_uri_scheme?(url) + + needle = Addressable::URI.parse(url).host + haystack = Addressable::URI.parse(@account.uri).host + + !haystack.casecmp(needle).zero? + end +end diff --git a/app/workers/activitypub/fetch_replies_worker.rb b/app/workers/activitypub/fetch_replies_worker.rb new file mode 100644 index 000000000..bf466db54 --- /dev/null +++ b/app/workers/activitypub/fetch_replies_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +class ActivityPub::FetchRepliesWorker + include Sidekiq::Worker + include ExponentialBackoff + + sidekiq_options queue: 'pull', retry: 3 + + def perform(parent_status_id, replies_uri) + ActivityPub::FetchRepliesService.new.call(Status.find(parent_status_id), replies_uri) + end +end diff --git a/app/workers/concerns/exponential_backoff.rb b/app/workers/concerns/exponential_backoff.rb new file mode 100644 index 000000000..f2b931e33 --- /dev/null +++ b/app/workers/concerns/exponential_backoff.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module ExponentialBackoff + extend ActiveSupport::Concern + + included do + sidekiq_retry_in do |count| + 15 + 10 * (count**4) + rand(10 * (count**4)) + end + end +end diff --git a/app/workers/fetch_reply_worker.rb b/app/workers/fetch_reply_worker.rb new file mode 100644 index 000000000..f7aa25e81 --- /dev/null +++ b/app/workers/fetch_reply_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +class FetchReplyWorker + include Sidekiq::Worker + include ExponentialBackoff + + sidekiq_options queue: 'pull', retry: 3 + + def perform(child_url) + FetchRemoteStatusService.new.call(child_url) + end +end diff --git a/app/workers/thread_resolve_worker.rb b/app/workers/thread_resolve_worker.rb index c18a778d5..8bba9ca75 100644 --- a/app/workers/thread_resolve_worker.rb +++ b/app/workers/thread_resolve_worker.rb @@ -2,13 +2,10 @@ class ThreadResolveWorker include Sidekiq::Worker + include ExponentialBackoff sidekiq_options queue: 'pull', retry: 3 - sidekiq_retry_in do |count| - 15 + 10 * (count**4) + rand(10 * (count**4)) - end - def perform(child_status_id, parent_url) child_status = Status.find(child_status_id) parent_status = FetchRemoteStatusService.new.call(parent_url) diff --git a/spec/serializers/activitypub/note_spec.rb b/spec/serializers/activitypub/note_spec.rb new file mode 100644 index 000000000..55bfbc16b --- /dev/null +++ b/spec/serializers/activitypub/note_spec.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe ActivityPub::NoteSerializer do + let!(:account) { Fabricate(:account) } + let!(:other) { Fabricate(:account) } + let!(:parent) { Fabricate(:status, account: account, visibility: :public) } + let!(:reply1) { Fabricate(:status, account: account, thread: parent, visibility: :public) } + let!(:reply2) { Fabricate(:status, account: account, thread: parent, visibility: :public) } + let!(:reply3) { Fabricate(:status, account: other, thread: parent, visibility: :public) } + let!(:reply4) { Fabricate(:status, account: account, thread: parent, visibility: :public) } + let!(:reply5) { Fabricate(:status, account: account, thread: parent, visibility: :direct) } + + before(:each) do + @serialization = ActiveModelSerializers::SerializableResource.new(parent, serializer: ActivityPub::NoteSerializer, adapter: ActivityPub::Adapter) + end + + subject { JSON.parse(@serialization.to_json) } + + it 'has a Note type' do + expect(subject['type']).to eql('Note') + end + + it 'has a replies collection' do + expect(subject['replies']['type']).to eql('Collection') + end + + it 'has a replies collection with a first Page' do + expect(subject['replies']['first']['type']).to eql('CollectionPage') + end + + it 'includes public self-replies in its replies collection' do + expect(subject['replies']['first']['items']).to include(reply1.uri, reply2.uri, reply4.uri) + end + + it 'does not include replies from others in its replies collection' do + expect(subject['replies']['first']['items']).to_not include(reply3.uri) + end + + it 'does not include replies with direct visibility in its replies collection' do + expect(subject['replies']['first']['items']).to_not include(reply5.uri) + end +end diff --git a/spec/services/activitypub/fetch_replies_service_spec.rb b/spec/services/activitypub/fetch_replies_service_spec.rb new file mode 100644 index 000000000..65c453341 --- /dev/null +++ b/spec/services/activitypub/fetch_replies_service_spec.rb @@ -0,0 +1,122 @@ +require 'rails_helper' + +RSpec.describe ActivityPub::FetchRepliesService, type: :service do + let(:actor) { Fabricate(:account, domain: 'example.com', uri: 'http://example.com/account') } + let(:status) { Fabricate(:status, account: actor) } + let(:collection_uri) { 'http://example.com/replies/1' } + + let(:items) do + [ + 'http://example.com/self-reply-1', + 'http://example.com/self-reply-2', + 'http://example.com/self-reply-3', + 'http://other.com/other-reply-1', + 'http://other.com/other-reply-2', + 'http://other.com/other-reply-3', + 'http://example.com/self-reply-4', + 'http://example.com/self-reply-5', + 'http://example.com/self-reply-6', + ] + end + + let(:payload) do + { + '@context': 'https://www.w3.org/ns/activitystreams', + type: 'Collection', + id: collection_uri, + items: items, + }.with_indifferent_access + end + + subject { described_class.new } + + describe '#call' do + context 'when the payload is a Collection with inlined replies' do + context 'when passing the collection itself' do + it 'spawns workers for up to 5 replies on the same server' do + allow(FetchReplyWorker).to receive(:push_bulk) + subject.call(status, payload) + expect(FetchReplyWorker).to have_received(:push_bulk).with(['http://example.com/self-reply-1', 'http://example.com/self-reply-2', 'http://example.com/self-reply-3', 'http://example.com/self-reply-4', 'http://example.com/self-reply-5']) + end + end + + context 'when passing the URL to the collection' do + before do + stub_request(:get, collection_uri).to_return(status: 200, body: Oj.dump(payload)) + end + + it 'spawns workers for up to 5 replies on the same server' do + allow(FetchReplyWorker).to receive(:push_bulk) + subject.call(status, collection_uri) + expect(FetchReplyWorker).to have_received(:push_bulk).with(['http://example.com/self-reply-1', 'http://example.com/self-reply-2', 'http://example.com/self-reply-3', 'http://example.com/self-reply-4', 'http://example.com/self-reply-5']) + end + end + end + + context 'when the payload is an OrderedCollection with inlined replies' do + let(:payload) do + { + '@context': 'https://www.w3.org/ns/activitystreams', + type: 'OrderedCollection', + id: collection_uri, + orderedItems: items, + }.with_indifferent_access + end + + context 'when passing the collection itself' do + it 'spawns workers for up to 5 replies on the same server' do + allow(FetchReplyWorker).to receive(:push_bulk) + subject.call(status, payload) + expect(FetchReplyWorker).to have_received(:push_bulk).with(['http://example.com/self-reply-1', 'http://example.com/self-reply-2', 'http://example.com/self-reply-3', 'http://example.com/self-reply-4', 'http://example.com/self-reply-5']) + end + end + + context 'when passing the URL to the collection' do + before do + stub_request(:get, collection_uri).to_return(status: 200, body: Oj.dump(payload)) + end + + it 'spawns workers for up to 5 replies on the same server' do + allow(FetchReplyWorker).to receive(:push_bulk) + subject.call(status, collection_uri) + expect(FetchReplyWorker).to have_received(:push_bulk).with(['http://example.com/self-reply-1', 'http://example.com/self-reply-2', 'http://example.com/self-reply-3', 'http://example.com/self-reply-4', 'http://example.com/self-reply-5']) + end + end + end + + context 'when the payload is a paginated Collection with inlined replies' do + let(:payload) do + { + '@context': 'https://www.w3.org/ns/activitystreams', + type: 'Collection', + id: collection_uri, + first: { + type: 'CollectionPage', + partOf: collection_uri, + items: items, + } + }.with_indifferent_access + end + + context 'when passing the collection itself' do + it 'spawns workers for up to 5 replies on the same server' do + allow(FetchReplyWorker).to receive(:push_bulk) + subject.call(status, payload) + expect(FetchReplyWorker).to have_received(:push_bulk).with(['http://example.com/self-reply-1', 'http://example.com/self-reply-2', 'http://example.com/self-reply-3', 'http://example.com/self-reply-4', 'http://example.com/self-reply-5']) + end + end + + context 'when passing the URL to the collection' do + before do + stub_request(:get, collection_uri).to_return(status: 200, body: Oj.dump(payload)) + end + + it 'spawns workers for up to 5 replies on the same server' do + allow(FetchReplyWorker).to receive(:push_bulk) + subject.call(status, collection_uri) + expect(FetchReplyWorker).to have_received(:push_bulk).with(['http://example.com/self-reply-1', 'http://example.com/self-reply-2', 'http://example.com/self-reply-3', 'http://example.com/self-reply-4', 'http://example.com/self-reply-5']) + end + end + end + end +end diff --git a/spec/workers/activitypub/fetch_replies_worker_spec.rb b/spec/workers/activitypub/fetch_replies_worker_spec.rb new file mode 100644 index 000000000..91ef3c4b9 --- /dev/null +++ b/spec/workers/activitypub/fetch_replies_worker_spec.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe ActivityPub::FetchRepliesWorker do + subject { described_class.new } + + let(:account) { Fabricate(:account, uri: 'https://example.com/user/1') } + let(:status) { Fabricate(:status, account: account) } + + let(:payload) do + { + '@context': 'https://www.w3.org/ns/activitystreams', + id: 'https://example.com/statuses_replies/1', + type: 'Collection', + items: [], + } + end + + let(:json) { Oj.dump(payload) } + + describe 'perform' do + it 'performs a request if the collection URI is from the same host' do + stub_request(:get, 'https://example.com/statuses_replies/1').to_return(status: 200, body: json) + subject.perform(status.id, 'https://example.com/statuses_replies/1') + expect(a_request(:get, 'https://example.com/statuses_replies/1')).to have_been_made.once + end + + it 'does not perform a request if the collection URI is from a different host' do + stub_request(:get, 'https://other.com/statuses_replies/1').to_return(status: 200) + subject.perform(status.id, 'https://other.com/statuses_replies/1') + expect(a_request(:get, 'https://other.com/statuses_replies/1')).to_not have_been_made + end + + it 'raises when request fails' do + stub_request(:get, 'https://example.com/statuses_replies/1').to_return(status: 500) + expect { subject.perform(status.id, 'https://example.com/statuses_replies/1') }.to raise_error Mastodon::UnexpectedResponseError + end + end +end