Downloading S3 objects quickly with parallel processing

Download in parallel using threads.

#!/usr/bin/env ruby
require 'rubygems'
require 'base64'
require 'cgi'
require 'net/http'
require 'nokogiri'
require 'openssl'
require 'time'

Net::HTTP.version_1_2

# Build an AWS Signature Version 2 signature for a GET request
def aws_sign(secret_access_key_id, date, bucket, path)
  string_to_sign = "GET\n\n\n#{date}\n/#{bucket}#{path}"
  digest = OpenSSL::HMAC.digest(OpenSSL::Digest::SHA1.new, secret_access_key_id, string_to_sign)
  Base64.encode64(digest).gsub("\n", '')
end

# Find the object's size via a ListBucket request (max-keys=1) by reading
# the <Size> element of the matching entry
def gauge(bucket, date, path, access_key_id, secret_access_key_id)
  host = "#{bucket}.s3.amazonaws.com"
  signature = aws_sign(secret_access_key_id, date, bucket, '/')
  query = '/?prefix=' + CGI.escape(path.sub(%r|^/|, '')) + '&max-keys=1'

  header = {
    'Host' => host,
    'Date' => date,
    'Authorization' => "AWS #{access_key_id}:#{signature}"
  }

  content = nil

  Net::HTTP.start(host, 80) do |http|
    content = http.get(query, header).body
  end

  Nokogiri::HTML(content).at_css('size').content.to_i
end
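
# The ListBucket response looks roughly like this (heavily trimmed):
#
#   <ListBucketResult>
#     <Contents>
#       <Key>path/to/object</Key>
#       <Size>10485760</Size>
#     </Contents>
#   </ListBucketResult>
#
# Parsing it with Nokogiri::HTML lowercases the tag names and drops the
# namespace, which is why at_css('size') finds the <Size> element.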

# Split `size` bytes into [index, first_byte, last_byte] ranges of at most
# chunk_size bytes each
def split(size, chunk_size)
  tail_start = 0
  n = size / chunk_size

  ranges = (0...n).map do |i|
    start = chunk_size * i
    tail_start = chunk_size * (i + 1)
    finish = tail_start - 1

    [i, start, finish]
  end

  unless (size % chunk_size).zero?
    ranges << [n, tail_start, size - 1]
  end

  ranges
end
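
# Worked example (illustrative, not from the original post):
#   split(2_500_000, 1024 * 1024)
#   # => [[0, 0, 1048575], [1, 1048576, 2097151], [2, 2097152, 2499999]]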

# Download the object in parallel: one thread per 1 MB range, then
# reassemble the chunks in order
def get_multi(bucket, path, access_key_id, secret_access_key_id)
  host = "#{bucket}.s3.amazonaws.com"
  date = Time.now.rfc2822
  signature = aws_sign(secret_access_key_id, date, bucket, path)

  header = {
    'Host' => host,
    'Date' => date,
    'Authorization' => "AWS #{access_key_id}:#{signature}"
  }

  # work out the object size and split it into 1 MB ranges
  size = gauge(bucket, date, path, access_key_id, secret_access_key_id)
  ranges = split(size, 1024 * 1024)

  chunks = []

  # One thread per range, each with its own HTTP connection and its own
  # copy of the headers, since a Net::HTTP connection can't be shared
  # safely across threads
  ranges.map {|i, start, finish|
    Thread.fork(header.dup) do |h|
      Net::HTTP.start(host, 80) do |http|
        h['Range'] = "bytes=#{start}-#{finish}"
        chunks[i] = http.get(path, h).body
      end
    end
  }.each {|t| t.join }

  chunks.join
end

BucketName = 'my_bucket'
Path = '/path/to/object'
AWSAccessKeyId = '<MyAWSAccessKeyId>'
AWSSecretAccessKey = '<MyAWSSecretAccessKey>'

content = get_multi(BucketName, Path, AWSAccessKeyId, AWSSecretAccessKey)
puts(content.length / 1024 / 1024)  # print the downloaded size in MB
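
The shell session below compares this against a plain single-request download (s3get.rb), which the post doesn't show. A minimal sketch of what it might look like, reusing the aws_sign helper above (the get_single name is just for illustration):

# Hypothetical single-request version, roughly what s3get.rb would do
def get_single(bucket, path, access_key_id, secret_access_key_id)
  host = "#{bucket}.s3.amazonaws.com"
  date = Time.now.rfc2822
  signature = aws_sign(secret_access_key_id, date, bucket, path)

  header = {
    'Host' => host,
    'Date' => date,
    'Authorization' => "AWS #{access_key_id}:#{signature}"
  }

  # A single GET for the whole object, with no Range header
  Net::HTTP.start(host, 80) {|http| http.get(path, header).body }
end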

Downloading a 10 MB object the ordinary way…


~$ time ./s3get.rb
10

real 0m19.856s
user 0m0.046s
sys 0m0.015s

It takes 19 seconds, but if you download it in parallel…

~$ time ./s3getm.rb
10

real 0m11.855s
user 0m0.046s
sys 0m0.015s

The download finishes in 11 seconds.

Thoughts

  • This test was run on Windows; it would probably be faster on Linux
  • Forking processes instead of using threads would likely improve throughput (a rough sketch follows below)
    • Though stitching the object back together is a bit of a pain
  • Net::HTTP does not seem to be thread-safe (hence the separate connection per thread in the code above)
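
The fork-based variant wasn't tried in this post, but a rough sketch might look like the following. The get_multi_fork name and the chunk.N temporary files are assumptions of mine, and fork requires a Unix-like OS (it isn't available on Windows):

# Rough sketch only: one child process per range, chunks handed back to the
# parent through temporary files and stitched together in order
def get_multi_fork(bucket, path, access_key_id, secret_access_key_id)
  host = "#{bucket}.s3.amazonaws.com"
  date = Time.now.rfc2822
  signature = aws_sign(secret_access_key_id, date, bucket, path)

  header = {
    'Host' => host,
    'Date' => date,
    'Authorization' => "AWS #{access_key_id}:#{signature}"
  }

  size = gauge(bucket, date, path, access_key_id, secret_access_key_id)
  ranges = split(size, 1024 * 1024)

  # Each child downloads one byte range into its own chunk file
  pids = ranges.map do |i, start, finish|
    fork do
      h = header.merge('Range' => "bytes=#{start}-#{finish}")
      body = Net::HTTP.start(host, 80) {|http| http.get(path, h).body }
      File.open("chunk.#{i}", 'wb') {|f| f.write(body) }
    end
  end
  pids.each {|pid| Process.waitpid(pid) }

  # The parent reads the chunk files back in order and joins them
  ranges.map {|i, _start, _finish| File.open("chunk.#{i}", 'rb') {|f| f.read } }.join
end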