#!/usr/bin/env ruby
# encoding: UTF-8
# scrape-ff 2015.04.10.0938
#
# Mirrors a FriendFeed feed (index pages, individual entries, stylesheets,
# scripts and images) into the local filesystem, rewriting links so the
# archive is browsable offline.
#
# 2015.03.12 - initial attempt
# 2015.03.13 - also changed "pager top" links to point to local index pages
# 2015.03.14 - download last index page properly instead of hanging
# 2015.03.17 - changed directory structure so that main feed indices won't collide with comment feed indices
# 2015.03.18 - fixed relative paths for entries that don't have a slug
# 2015.03.23 - fixed case where there are only a total of two index pages
# 2015.03.26 - rewrite links that don't contain http:// into relative links into the file system
# 2015.04.03 - don't mangle https:// links; rewrite *all* links that don't contain http:// or https:// into relative links
# 2015.04.08 - fixed a bug that mangled URLs that contained the substring '/static/' in the middle of the URL and caused 404s
# 2015.04.09 - fixed a bug that mangled index URLs
# 2015.04.09.1301 - added error handling for 403s
# 2015.04.10.0938 - fixed a bug where parsing of http://i.friendfeed.com/ links failed

require 'nokogiri'
require 'open-uri'
require 'fileutils'

# Download one site asset (a stylesheet or a script) at +relpath+ from
# +baseurl+ and save it under ./<relpath>, creating directories as needed.
# +label+ is only used to prefix the progress output ("stylesheet"/"script").
# When +skip_existing+ is true an already-downloaded file is left alone;
# otherwise it is overwritten (matches the original index-page behavior).
def mirror_asset(baseurl, relpath, label, skip_existing)
  localpath = ['.', relpath].join
  assetdir = File.dirname(localpath)
  asseturl = [baseurl, relpath].join
  puts label + "relpath: " + relpath
  puts label + "localpath: " + localpath
  puts label + "dir: " + assetdir
  puts label + "url: " + asseturl
  FileUtils.mkdir_p(assetdir) unless File.exist?(assetdir)
  if skip_existing && File.exist?(localpath)
    puts localpath + " exists, skipping"
  else
    # FIX: original message was missing the separating space.
    puts "downloading and saving " + localpath if skip_existing
    # NOTE(review): Kernel#open via open-uri works on the Ruby of this
    # script's era; on Ruby >= 3.0 this would need URI.open instead.
    File.write(localpath, open(asseturl).read, mode: 'wb')
  end
end

# Mirror the stylesheet and all <script src=...> assets referenced by +doc+.
# Query strings are stripped before the path is mirrored to disk.
def mirror_page_assets(doc, baseurl, skip_existing)
  stylesheet = doc.xpath('//link[@rel="stylesheet"]/@href').first
  if stylesheet
    mirror_asset(baseurl, stylesheet.content.gsub(/\?.*/, ''), "stylesheet", skip_existing)
  end
  doc.xpath('//script/@src').each do |script|
    mirror_asset(baseurl, script.content.gsub(/\?.*/, ''), "script", skip_existing)
  end
end

# Download a media file from +imgurl+ into +imgfn+ (a relative path that
# mirrors the source host's layout). Existing files are skipped; HTTP errors
# (e.g. 403s, per the 2015.04.09.1301 changelog entry) are reported and
# swallowed so one bad image doesn't abort the whole scrape.
def download_media(imgurl, imgfn)
  imgdir = File.dirname(imgfn)
  FileUtils.mkdir_p(imgdir) unless File.exist?(imgdir)
  if File.exist?(imgfn)
    puts "Skipping " + imgfn
  else
    puts "Downloading " + imgurl
    puts "Saving " + imgfn
    begin
      File.write(imgfn, open(imgurl).read, mode: 'wb')
    rescue OpenURI::HTTPError => httperror
      puts httperror
    end
  end
end

ffbaseurl = "http://friendfeed.com"
feedname = ARGV[0]

if feedname
  FileUtils.mkdir_p(feedname) unless File.exist?(feedname)
  currentindexurl = [ffbaseurl, feedname].join('/')
  indexfilename = [feedname, "index.html"].join('/')
  # Depth of the feed directory, used to build ../ prefixes for links that
  # point back to the mirror root.
  subdirdepth = feedname.split('/').count
  indexcount = 0

  loop do
    @index = Nokogiri::HTML(open(currentindexurl))

    # Find the URL of the next ("older") index page BEFORE any rewriting.
    # On the first page the only pager link is "Older items »" in slot [1];
    # on later pages slot [1] is "« Newer items" and slot [2] is older.
    older_href = @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href')
    nextindexrelpath = older_href.first.content if older_href.any?
    first_pager = @index.xpath('(//div[@class="pager bottom"]/a)[1]')
    if first_pager.any? && first_pager.first.content == "Older items »"
      nextindexrelpath = @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content
    end
    puts "nextindexrelpath: " + nextindexrelpath.to_s
    puts "Downloading index page " + indexcount.to_s

    # Site-wide assets only need to be fetched once (the index pages share
    # them); index-page assets are overwritten rather than skipped.
    mirror_page_assets(@index, ffbaseurl, false) if indexcount == 0

    # Re-point the pager links at the local indexN.html files. Page 0 only
    # has a "top" pager link pointing forward; page 1 links back to
    # index.html; all later pages link to index<n-1>/<n+1>.html.
    if indexcount == 0
      if @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').any?
        @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').first.content = "index1.html"
      elsif @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').any?
        # FIX: guard added — a single-page feed with no pager at all used to
        # raise NoMethodError on nil here.
        @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content = "index1.html"
      end
    elsif indexcount == 1
      @index.xpath('(//div[@class="pager top"]/a)[1]/@href').first.content = "index.html"
      @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content = "index.html"
      if @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').any?
        @index.xpath('(//div[@class="pager top"]/a)[2]/@href').first.content = "index2.html"
        @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').first.content = "index2.html"
      end
    else
      prevname = ["index", (indexcount - 1).to_s, ".html"].join
      nextname = ["index", (indexcount + 1).to_s, ".html"].join
      @index.xpath('(//div[@class="pager top"]/a)[1]/@href').first.content = prevname
      @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content = prevname
      if @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').any?
        @index.xpath('(//div[@class="pager top"]/a)[2]/@href').first.content = nextname
        @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').first.content = nextname
      end
    end

    # Each a.date link is a permalink to an individual entry page: download
    # it, mirror its assets and media, and rewrite its links for offline use.
    @index.xpath('//a[@class="date"]').each do |post|
      postrelpath = post.xpath('@href').text
      # Depth of the entry page itself, for ../ prefixes inside the entry.
      postrelpathsubdirdepth = postrelpath.split('/').count - 2
      postlocalpath = ['.', postrelpath].join
      posturl = [ffbaseurl, postrelpath].join
      postdir = File.dirname(postlocalpath)
      post.xpath('@href').first.content = ['../' * subdirdepth, postlocalpath].join
      FileUtils.mkdir_p(postdir) unless File.exist?(postdir)
      if File.exist?(postlocalpath)
        puts "Skipping " + postlocalpath
        next
      end
      puts "Saving " + postlocalpath
      @postcontent = Nokogiri::HTML(open(posturl))
      mirror_page_assets(@postcontent, ffbaseurl, true)

      puts "iterating through @postcontent.xpath('//img/@src')"
      @postcontent.xpath('//img/@src').each do |imgtag|
        src = imgtag.content
        if !src.match('^/static/') &&
           !src.include?('http://m.friendfeed-media.com/') &&
           !src.include?('http://i.friendfeed.com/')
          puts "Preserving link URL " + src
        else
          if src.match('^/static/')
            imgurl = [ffbaseurl, src].join
            imgfn = src.gsub('/static/', 'static/').gsub(/\?.*/, '')
          elsif src.include?('http://m.friendfeed-media.com/')
            imgurl = src
            imgfn = src.gsub('http://m.friendfeed-media.com/', 'm.friendfeed-media.com/')
          else # http://i.friendfeed.com/
            imgurl = src
            imgfn = src.gsub('http://i.friendfeed.com/', 'i.friendfeed.com/')
          end
          puts "imgurl: " + imgurl
          puts "imgfn: " + imgfn
          download_media(imgurl, imgfn)
          imgtag.content = ['../' * postrelpathsubdirdepth, imgfn].join
        end
      end

      # Rewrite site-relative links into filesystem-relative ones; absolute
      # http(s) links are left untouched (2015.04.03 changelog entry).
      @postcontent.xpath('//a/@href').each do |ahref|
        next if ahref.content.include?('http://') || ahref.content.include?('https://')
        localurl = ['.', '/..' * postrelpathsubdirdepth, ahref.content].join
        puts "rewriting " + ahref.content + " as " + localurl
        ahref.content = localurl
      end

      # Links that wrap an image point at full-size media: mirror those too.
      @postcontent.xpath('//a[descendant::img]/@href').each do |ahref|
        href = ahref.content
        if !href.include?('http://m.friendfeed-media.com') &&
           !href.include?('http://i.friendfeed.com')
          puts "Preserving link URL " + href
        else
          imgurl = href
          if href.include?('http://m.friendfeed-media.com')
            imgfn = href.gsub('http://m.friendfeed-media.com', 'm.friendfeed-media.com')
          end
          if href.include?('http://i.friendfeed.com')
            imgfn = href.gsub('http://i.friendfeed.com', 'i.friendfeed.com')
          end
          puts "imgurl: " + imgurl
          puts "imgfn: " + imgfn
          download_media(imgurl, imgfn)
          ahref.content = ['../' * postrelpathsubdirdepth, imgfn].join
        end
      end

      @postcontent.xpath('//link[@rel="stylesheet"]/@href').first.content =
        ['.', '/..' * postrelpathsubdirdepth,
         @postcontent.xpath('//link[@rel="stylesheet"]/@href').first.content].join
      @postcontent.xpath('//script/@src').each do |script|
        script.content = ['.', '/..' * postrelpathsubdirdepth, script.content].join
      end
      # FIX: block form closes the file handle (File.new was never closed).
      File.open(['.', postrelpath].join, "w") { |f| f.puts @postcontent.to_html }
    end

    # Mirror images referenced directly by the index page.
    @index.xpath('//img/@src').each do |imgtag|
      src = imgtag.content
      if !src.match('^/static/') &&
         !src.include?('http://m.friendfeed-media.com/') &&
         !src.include?('http://i.friendfeed.com/')
        puts "Preserving link URL " + src
      else
        if src.match('^/static/')
          imgurl = [ffbaseurl, src].join
          urlpath = '/static/'
          fspath = 'static/'
        elsif src.include?('http://m.friendfeed-media.com/')
          imgurl = src
          urlpath = 'http://m.friendfeed-media.com/'
          fspath = 'm.friendfeed-media.com/'
        else # http://i.friendfeed.com/
          imgurl = src
          urlpath = 'http://i.friendfeed.com/'
          fspath = 'i.friendfeed.com/'
        end
        imgfn = src.gsub(urlpath, fspath).gsub(/\?.*/, '')
        puts "imgurl: " + imgurl
        puts "imgfn: " + imgfn
        download_media(imgurl, imgfn)
        imgtag.content = ['../' * subdirdepth, imgfn].join
        puts "imgtag.content: " + imgtag.content
      end
    end

    # Rewrite the index page's non-date links (with a non-"date" class, and
    # with no class at all — two disjoint node sets) into local paths,
    # leaving absolute URLs and the already-rewritten indexN.html pager
    # links alone.
    ['//a[@class!="date"]/@href', '//a[not(@class)]/@href'].each do |selector|
      @index.xpath(selector).each do |ahref|
        next if ahref.content.include?('http://') ||
                ahref.content.include?('https://') ||
                ahref.content.match('^index.*\.html')
        localurl = ['.', '/..' * subdirdepth, ahref.content].join
        puts "rewriting " + ahref.content + " as " + localurl
        ahref.content = localurl
      end
    end

    # Mirror full-size media linked from index-page thumbnails.
    @index.xpath('//a[descendant::img]/@href').each do |ahref|
      href = ahref.content
      if !href.include?('http://m.friendfeed-media.com') &&
         !href.include?('http://i.friendfeed.com')
        puts "Preserving link URL " + href
      else
        imgurl = href
        if href.include?('http://m.friendfeed-media.com')
          urlpath = 'http://m.friendfeed-media.com'
          fspath = 'm.friendfeed-media.com'
        end
        if href.include?('http://i.friendfeed.com')
          # BUG FIX: urlpath was previously set to
          # 'http://m.friendfeed-media.com' here, so the gsub below never
          # matched and i.friendfeed.com links kept their full URL as the
          # local filename (and the rewritten href pointed nowhere).
          urlpath = 'http://i.friendfeed.com'
          fspath = 'i.friendfeed.com'
        end
        imgfn = href.gsub(urlpath, fspath)
        puts "imgurl: " + imgurl
        puts "imgfn: " + imgfn
        download_media(imgurl, imgfn)
        ahref.content = ['../' * subdirdepth, imgfn].join
        puts "ahref.content: " + ahref.content
      end
    end

    @index.xpath('//link[@rel="stylesheet"]/@href').first.content =
      ['.', '/..' * subdirdepth,
       @index.xpath('//link[@rel="stylesheet"]/@href').first.content].join
    @index.xpath('//script/@src').each do |script|
      script.content = ['.', '/..' * subdirdepth, script.content].join
    end

    # FIX: block form closes the file handle (File.new was never closed).
    File.open(['./', indexfilename].join, "w") { |f| f.puts @index.to_html }

    indexcount += 1
    currentindexurl = [ffbaseurl, nextindexrelpath].join
    indexfilename = [feedname, '/', "index", indexcount.to_s, ".html"].join
    puts "currentindexurl: " + currentindexurl
    puts "indexfilename: " + indexfilename

    # Stop after the last page: no second pager link, and the first pager
    # link (if any) is not "Older items »". FIX: guard against a missing
    # first pager link, which used to raise NoMethodError on nil.
    last_first_pager = @index.xpath('(//div[@class="pager bottom"]/a)[1]')
    break if @index.xpath('(//div[@class="pager bottom"]/a)[2]').empty? &&
             (last_first_pager.empty? || last_first_pager.first.content != "Older items »")
  end
else
  puts "Please specify a feed name: scrape-ff [feed name]"
  puts " "
  puts "e.g. scrape-ff aswang # download feed and associated individual entries and images of user aswang"
  puts "     scrape-ff aswang/comments # download feed of all entries that user aswang has commented on as well as individual entries and images"
end