39 lines
834 B
Ruby
39 lines
834 B
Ruby
# Fetches a single URL and exposes the links, title and body text found in
# the response.
#
# The response body is always parsed with Nokogiri's HTML parser, including
# when the URL points at an XML sitemap.
class PageCrawlerService
  attr_reader :external_link

  # Downloads +external_link+ with HTTParty and parses the response body.
  # The HTTP request happens here, at construction time.
  #
  # @param external_link [String] URL of the page or sitemap to crawl
  def initialize(external_link)
    @external_link = external_link
    @doc = Nokogiri::HTML(HTTParty.get(external_link).body)
  end

  # Links discovered in the fetched document.
  #
  # For a URL ending in ".xml" the links are the contents of the +loc+
  # elements; otherwise they are the +href+ values of the anchors, resolved
  # against +external_link+.
  #
  # @return [Set<String>] unique link URLs
  def page_links
    sitemap? ? extract_links_from_sitemap : extract_links_from_html
  end

  # @return [String, nil] the stripped text of the first +title+ element, or
  #   nil when the document has none
  def page_title
    @doc.at_xpath('//title')&.text&.strip
  end

  # Converts the document's +body+ element with ReverseMarkdown, requesting
  # GitHub-flavored output and the +:bypass+ policy for unknown tags.
  #
  # @return [String] the value returned by ReverseMarkdown.convert
  def body_text_content
    ReverseMarkdown.convert(@doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true)
  end

  private

  # @return [Boolean] true when the crawled URL ends with ".xml"
  def sitemap?
    @external_link.end_with?('.xml')
  end

  # Collects the text of every +loc+ element.
  #
  # The text is stripped because pretty-printed sitemaps commonly wrap the URL
  # in newlines and indentation; entries that are blank after stripping are
  # dropped so they do not show up as empty links.
  #
  # @return [Set<String>]
  def extract_links_from_sitemap
    @doc.xpath('//loc').each_with_object(Set.new) do |loc, links|
      url = loc.text.strip
      links << url unless url.empty?
    end
  end

  # Collects every anchor +href+ as an absolute URL.
  #
  # @return [Set<String>]
  def extract_links_from_html
    @doc.xpath('//a/@href').each_with_object(Set.new) do |href, links|
      url = absolute_url_for(href.value)
      links << url if url
    end
  end

  # Resolves +href+ against the crawled URL.
  #
  # Real-world pages contain hrefs that are not valid URIs (unescaped spaces,
  # stray brackets, ...). Those are skipped by returning nil so that a single
  # bad anchor does not abort link extraction for the whole page.
  #
  # @param href [String] raw value of an +href+ attribute
  # @return [String, nil] the absolute URL, or nil when +href+ cannot be parsed
  def absolute_url_for(href)
    URI.join(@external_link, href.strip).to_s
  rescue URI::InvalidURIError
    nil
  end
end
|