#!/usr/bin/env ruby
# encoding: UTF-8
# scrape-ff 2015.04.10.0938
#
# Mirrors a FriendFeed feed (index pages, individual entries, stylesheets,
# scripts and images) into the local filesystem, rewriting links so the
# archive is browsable offline.
#
# 2015.03.12 - initial attempt
# 2015.03.13 - also changed "pager top" links to point to local index pages
# 2015.03.14 - download last index page properly instead of hanging
# 2015.03.17 - changed directory structure so that main feed indices won't collide with comment feed indices
# 2015.03.18 - fixed relative paths for entries that don't have a slug
# 2015.03.23 - fixed case where there are only a total of two index pages
# 2015.03.26 - rewrite links that don't contain http:// into relative links into the file system
# 2015.04.03 - don't mangle https:// links; rewrite *all* links that don't contain http:// or https:// into relative links
# 2015.04.08 - fixed a bug that mangled URLs that contained the substring '/static/' in the middle of the URL and caused 404s
# 2015.04.09 - fixed a bug that mangled index URLs
# 2015.04.09.1301 - added error handling for 403s
# 2015.04.10.0938 - fixed a bug where parsing of http://i.friendfeed.com/ links failed

require 'nokogiri'
require 'open-uri'
require 'fileutils'

# Download one site asset (a stylesheet or a script) at +relpath+ from
# +baseurl+ and save it under ./<relpath>, creating directories as needed.
# +label+ is only used to prefix the progress output ("stylesheet"/"script").
# When +skip_existing+ is true an already-downloaded file is left alone;
# otherwise it is overwritten (matches the original index-page behavior).
def mirror_asset(baseurl, relpath, label, skip_existing)
  localpath = ['.', relpath].join
  assetdir = File.dirname(localpath)
  asseturl = [baseurl, relpath].join
  puts label + "relpath: " + relpath
  puts label + "localpath: " + localpath
  puts label + "dir: " + assetdir
  puts label + "url: " + asseturl
  FileUtils.mkdir_p(assetdir) unless File.exist?(assetdir)
  if skip_existing && File.exist?(localpath)
    puts localpath + " exists, skipping"
  else
    # FIX: original message was missing the separating space.
    puts "downloading and saving " + localpath if skip_existing
    # NOTE(review): Kernel#open via open-uri works on the Ruby of this
    # script's era; on Ruby >= 3.0 this would need URI.open instead.
    File.write(localpath, open(asseturl).read, mode: 'wb')
  end
end

# Mirror the stylesheet and all <script src=...> assets referenced by +doc+.
# Query strings are stripped before the path is mirrored to disk.
def mirror_page_assets(doc, baseurl, skip_existing)
  stylesheet = doc.xpath('//link[@rel="stylesheet"]/@href').first
  if stylesheet
    mirror_asset(baseurl, stylesheet.content.gsub(/\?.*/, ''), "stylesheet", skip_existing)
  end
  doc.xpath('//script/@src').each do |script|
    mirror_asset(baseurl, script.content.gsub(/\?.*/, ''), "script", skip_existing)
  end
end

# Download a media file from +imgurl+ into +imgfn+ (a relative path that
# mirrors the source host's layout). Existing files are skipped; HTTP errors
# (e.g. 403s, per the 2015.04.09.1301 changelog entry) are reported and
# swallowed so one bad image doesn't abort the whole scrape.
def download_media(imgurl, imgfn)
  imgdir = File.dirname(imgfn)
  FileUtils.mkdir_p(imgdir) unless File.exist?(imgdir)
  if File.exist?(imgfn)
    puts "Skipping " + imgfn
  else
    puts "Downloading " + imgurl
    puts "Saving " + imgfn
    begin
      File.write(imgfn, open(imgurl).read, mode: 'wb')
    rescue OpenURI::HTTPError => httperror
      puts httperror
    end
  end
end

ffbaseurl = "http://friendfeed.com"
feedname = ARGV[0]

if feedname
  FileUtils.mkdir_p(feedname) unless File.exist?(feedname)
  currentindexurl = [ffbaseurl, feedname].join('/')
  indexfilename = [feedname, "index.html"].join('/')
  # Depth of the feed directory, used to build ../ prefixes for links that
  # point back to the mirror root.
  subdirdepth = feedname.split('/').count
  indexcount = 0

  loop do
    @index = Nokogiri::HTML(open(currentindexurl))

    # Find the URL of the next ("older") index page BEFORE any rewriting.
    # On the first page the only pager link is "Older items »" in slot [1];
    # on later pages slot [1] is "« Newer items" and slot [2] is older.
    older_href = @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href')
    nextindexrelpath = older_href.first.content if older_href.any?
    first_pager = @index.xpath('(//div[@class="pager bottom"]/a)[1]')
    if first_pager.any? && first_pager.first.content == "Older items »"
      nextindexrelpath = @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content
    end
    puts "nextindexrelpath: " + nextindexrelpath.to_s
    puts "Downloading index page " + indexcount.to_s

    # Site-wide assets only need to be fetched once (the index pages share
    # them); index-page assets are overwritten rather than skipped.
    mirror_page_assets(@index, ffbaseurl, false) if indexcount == 0

    # Re-point the pager links at the local indexN.html files. Page 0 only
    # has a "top" pager link pointing forward; page 1 links back to
    # index.html; all later pages link to index<n-1>/<n+1>.html.
    if indexcount == 0
      if @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').any?
        @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').first.content = "index1.html"
      elsif @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').any?
        # FIX: guard added — a single-page feed with no pager at all used to
        # raise NoMethodError on nil here.
        @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content = "index1.html"
      end
    elsif indexcount == 1
      @index.xpath('(//div[@class="pager top"]/a)[1]/@href').first.content = "index.html"
      @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content = "index.html"
      if @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').any?
        @index.xpath('(//div[@class="pager top"]/a)[2]/@href').first.content = "index2.html"
        @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').first.content = "index2.html"
      end
    else
      prevname = ["index", (indexcount - 1).to_s, ".html"].join
      nextname = ["index", (indexcount + 1).to_s, ".html"].join
      @index.xpath('(//div[@class="pager top"]/a)[1]/@href').first.content = prevname
      @index.xpath('(//div[@class="pager bottom"]/a)[1]/@href').first.content = prevname
      if @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').any?
        @index.xpath('(//div[@class="pager top"]/a)[2]/@href').first.content = nextname
        @index.xpath('(//div[@class="pager bottom"]/a)[2]/@href').first.content = nextname
      end
    end

    # Each a.date link is a permalink to an individual entry page: download
    # it, mirror its assets and media, and rewrite its links for offline use.
    @index.xpath('//a[@class="date"]').each do |post|
      postrelpath = post.xpath('@href').text
      # Depth of the entry page itself, for ../ prefixes inside the entry.
      postrelpathsubdirdepth = postrelpath.split('/').count - 2
      postlocalpath = ['.', postrelpath].join
      posturl = [ffbaseurl, postrelpath].join
      postdir = File.dirname(postlocalpath)
      post.xpath('@href').first.content = ['../' * subdirdepth, postlocalpath].join
      FileUtils.mkdir_p(postdir) unless File.exist?(postdir)
      if File.exist?(postlocalpath)
        puts "Skipping " + postlocalpath
        next
      end
      puts "Saving " + postlocalpath
      @postcontent = Nokogiri::HTML(open(posturl))
      mirror_page_assets(@postcontent, ffbaseurl, true)

      puts "iterating through @postcontent.xpath('//img/@src')"
      @postcontent.xpath('//img/@src').each do |imgtag|
        src = imgtag.content
        if !src.match('^/static/') &&
           !src.include?('http://m.friendfeed-media.com/') &&
           !src.include?('http://i.friendfeed.com/')
          puts "Preserving link URL " + src
        else
          if src.match('^/static/')
            imgurl = [ffbaseurl, src].join
            imgfn = src.gsub('/static/', 'static/').gsub(/\?.*/, '')
          elsif src.include?('http://m.friendfeed-media.com/')
            imgurl = src
            imgfn = src.gsub('http://m.friendfeed-media.com/', 'm.friendfeed-media.com/')
          else # http://i.friendfeed.com/
            imgurl = src
            imgfn = src.gsub('http://i.friendfeed.com/', 'i.friendfeed.com/')
          end
          puts "imgurl: " + imgurl
          puts "imgfn: " + imgfn
          download_media(imgurl, imgfn)
          imgtag.content = ['../' * postrelpathsubdirdepth, imgfn].join
        end
      end

      # Rewrite site-relative links into filesystem-relative ones; absolute
      # http(s) links are left untouched (2015.04.03 changelog entry).
      @postcontent.xpath('//a/@href').each do |ahref|
        next if ahref.content.include?('http://') || ahref.content.include?('https://')
        localurl = ['.', '/..' * postrelpathsubdirdepth, ahref.content].join
        puts "rewriting " + ahref.content + " as " + localurl
        ahref.content = localurl
      end

      # Links that wrap an image point at full-size media: mirror those too.
      @postcontent.xpath('//a[descendant::img]/@href').each do |ahref|
        href = ahref.content
        if !href.include?('http://m.friendfeed-media.com') &&
           !href.include?('http://i.friendfeed.com')
          puts "Preserving link URL " + href
        else
          imgurl = href
          if href.include?('http://m.friendfeed-media.com')
            imgfn = href.gsub('http://m.friendfeed-media.com', 'm.friendfeed-media.com')
          end
          if href.include?('http://i.friendfeed.com')
            imgfn = href.gsub('http://i.friendfeed.com', 'i.friendfeed.com')
          end
          puts "imgurl: " + imgurl
          puts "imgfn: " + imgfn
          download_media(imgurl, imgfn)
          ahref.content = ['../' * postrelpathsubdirdepth, imgfn].join
        end
      end

      @postcontent.xpath('//link[@rel="stylesheet"]/@href').first.content =
        ['.', '/..' * postrelpathsubdirdepth,
         @postcontent.xpath('//link[@rel="stylesheet"]/@href').first.content].join
      @postcontent.xpath('//script/@src').each do |script|
        script.content = ['.', '/..' * postrelpathsubdirdepth, script.content].join
      end
      # FIX: block form closes the file handle (File.new was never closed).
      File.open(['.', postrelpath].join, "w") { |f| f.puts @postcontent.to_html }
    end

    # Mirror images referenced directly by the index page.
    @index.xpath('//img/@src').each do |imgtag|
      src = imgtag.content
      if !src.match('^/static/') &&
         !src.include?('http://m.friendfeed-media.com/') &&
         !src.include?('http://i.friendfeed.com/')
        puts "Preserving link URL " + src
      else
        if src.match('^/static/')
          imgurl = [ffbaseurl, src].join
          urlpath = '/static/'
          fspath = 'static/'
        elsif src.include?('http://m.friendfeed-media.com/')
          imgurl = src
          urlpath = 'http://m.friendfeed-media.com/'
          fspath = 'm.friendfeed-media.com/'
        else # http://i.friendfeed.com/
          imgurl = src
          urlpath = 'http://i.friendfeed.com/'
          fspath = 'i.friendfeed.com/'
        end
        imgfn = src.gsub(urlpath, fspath).gsub(/\?.*/, '')
        puts "imgurl: " + imgurl
        puts "imgfn: " + imgfn
        download_media(imgurl, imgfn)
        imgtag.content = ['../' * subdirdepth, imgfn].join
        puts "imgtag.content: " + imgtag.content
      end
    end

    # Rewrite the index page's non-date links (with a non-"date" class, and
    # with no class at all — two disjoint node sets) into local paths,
    # leaving absolute URLs and the already-rewritten indexN.html pager
    # links alone.
    ['//a[@class!="date"]/@href', '//a[not(@class)]/@href'].each do |selector|
      @index.xpath(selector).each do |ahref|
        next if ahref.content.include?('http://') ||
                ahref.content.include?('https://') ||
                ahref.content.match('^index.*\.html')
        localurl = ['.', '/..' * subdirdepth, ahref.content].join
        puts "rewriting " + ahref.content + " as " + localurl
        ahref.content = localurl
      end
    end

    # Mirror full-size media linked from index-page thumbnails.
    @index.xpath('//a[descendant::img]/@href').each do |ahref|
      href = ahref.content
      if !href.include?('http://m.friendfeed-media.com') &&
         !href.include?('http://i.friendfeed.com')
        puts "Preserving link URL " + href
      else
        imgurl = href
        if href.include?('http://m.friendfeed-media.com')
          urlpath = 'http://m.friendfeed-media.com'
          fspath = 'm.friendfeed-media.com'
        end
        if href.include?('http://i.friendfeed.com')
          # BUG FIX: urlpath was previously set to
          # 'http://m.friendfeed-media.com' here, so the gsub below never
          # matched and i.friendfeed.com links kept their full URL as the
          # local filename (and the rewritten href pointed nowhere).
          urlpath = 'http://i.friendfeed.com'
          fspath = 'i.friendfeed.com'
        end
        imgfn = href.gsub(urlpath, fspath)
        puts "imgurl: " + imgurl
        puts "imgfn: " + imgfn
        download_media(imgurl, imgfn)
        ahref.content = ['../' * subdirdepth, imgfn].join
        puts "ahref.content: " + ahref.content
      end
    end

    @index.xpath('//link[@rel="stylesheet"]/@href').first.content =
      ['.', '/..' * subdirdepth,
       @index.xpath('//link[@rel="stylesheet"]/@href').first.content].join
    @index.xpath('//script/@src').each do |script|
      script.content = ['.', '/..' * subdirdepth, script.content].join
    end

    # FIX: block form closes the file handle (File.new was never closed).
    File.open(['./', indexfilename].join, "w") { |f| f.puts @index.to_html }

    indexcount += 1
    currentindexurl = [ffbaseurl, nextindexrelpath].join
    indexfilename = [feedname, '/', "index", indexcount.to_s, ".html"].join
    puts "currentindexurl: " + currentindexurl
    puts "indexfilename: " + indexfilename

    # Stop after the last page: no second pager link, and the first pager
    # link (if any) is not "Older items »". FIX: guard against a missing
    # first pager link, which used to raise NoMethodError on nil.
    last_first_pager = @index.xpath('(//div[@class="pager bottom"]/a)[1]')
    break if @index.xpath('(//div[@class="pager bottom"]/a)[2]').empty? &&
             (last_first_pager.empty? || last_first_pager.first.content != "Older items »")
  end
else
  puts "Please specify a feed name: scrape-ff [feed name]"
  puts " "
  puts "e.g. scrape-ff aswang # download feed and associated individual entries and images of user aswang"
  puts "     scrape-ff aswang/comments # download feed of all entries that user aswang has commented on as well as individual entries and images"
end