# File lib/rbot/core/utils/httputil.rb, line 32
#
# Collect the candidate character sets for a response body.
#
# str:: body text inspected for an XML declaration or an HTML
#       <meta http-equiv="Content-Type"> tag (defaults to +raw_body+)
#
# Returns +nil+ when the Content-Type header (taken to be 'text/html' when the
# header is absent) is neither text/* nor XML-like. Otherwise returns an Array
# of charset names without duplicates: the 'latin1' fallback first, then the
# charset from the Content-Type header (if any), then the one declared in the
# body (if any).
#
# Media types, the +charset+ parameter name and HTML tag/attribute names are
# case-insensitive, so the header and <meta> patterns ignore case. The XML
# declaration is matched case-sensitively because XML itself is.
def body_charset(str=self.raw_body)
  ctype = self['content-type'] || 'text/html'
  return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i

  charsets = ['latin1'] # should be in config

  # Stop at ';' as well, so a parameter following the charset
  # (e.g. "charset=utf-8;foo=bar") is not glued onto the charset name.
  if ctype =~ /charset=["']?([^\s"';]+)["']?/i
    charsets << $1
    debug "charset #{charsets.last} added from header"
  end

  # Only the first matching branch is used: an XML declaration takes
  # precedence over an HTML meta tag.
  case str
  when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/
    charsets << $1
    debug "xml charset #{charsets.last} added from xml pi"
  when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
    meta = $1
    if meta =~ /charset=['"]?([^\s'";]+)['"]?/i
      charsets << $1
      debug "html charset #{charsets.last} added from meta"
    end
  end

  return charsets.uniq
end
# File lib/rbot/core/utils/httputil.rb, line 57
#
# Convert +str+ to UTF-8 using the charsets reported by +body_charset+.
#
# The candidates are tried from last to first. For each one the conversion is
# attempted on +str+ and then on +str+ with 1 to 5 trailing characters removed;
# the first conversion that does not raise is returned. +str+ is returned
# untouched when +body_charset+ yields nothing or when every attempt fails.
def body_to_utf(str)
  candidates = self.body_charset(str)
  return str unless candidates

  candidates.reverse.each do |charset|
    # XXX: this one is really ugly, but i don't know how to make it better
    #  -jsn
    6.times do |off|
      begin
        debug "trying #{charset} / offset #{off}"
        shortened = str.slice(0 .. (-1 - off))
        converted = Iconv.iconv('utf-8//ignore', charset, shortened)
        return converted.first
      rescue
        debug "conversion failed for #{charset} / offset #{off}"
      end
    end
  end

  str
end
# File lib/rbot/core/utils/httputil.rb, line 126
#
# Return the raw body after passing it through +decompress_body+ and then
# +body_to_utf+.
def cooked_body
  unpacked = self.decompress_body(self.raw_body)
  self.body_to_utf(unpacked)
end
# File lib/rbot/core/utils/httputil.rb, line 78
#
# Undo the transformation named by the Content-Encoding header.
#
# str:: the body (or the part of it read so far) as received
#
# Returns the decoded body:
# * no header or +identity+: +str+ itself;
# * anything containing +gzip+: the gunzipped data, or as many bytes as can be
#   recovered when the stream cannot be fully unpacked (e.g. partial read);
# * +deflate+: the inflated data, accepting both raw deflate and the
#   zlib-wrapped form;
# * a charset name (iso-8859-N, windows-N, utf-8, utf8): +str+ itself, after
#   moving the charset into the Content-Type header.
#
# Content-coding names are compared case-insensitively.
#
# Raises Zlib::Error when a +deflate+ body cannot be inflated and RuntimeError
# for any other content encoding.
def decompress_body(str)
  method = self['content-encoding']
  case method
  when nil, /\Aidentity\z/i
    return str
  when /gzip/i # Matches gzip, x-gzip, and the non-rfc-compliant gzip;q=\d sent by some servers
    debug "gunzipping body"
    begin
      return Zlib::GzipReader.new(StringIO.new(str)).read
    rescue Zlib::Error => e
      # If we can't unpack the whole stream (e.g. because we're doing a
      # partial read
      debug "full gunzipping failed (#{e}), trying to recover as much as possible"
      # Collect the bytes as Integers and pack them: String#<< with an Integer
      # appends a code point on Ruby >= 1.9, which would turn every byte
      # >= 0x80 into a multi-byte sequence.
      recovered = []
      begin
        Zlib::GzipReader.new(StringIO.new(str)).each_byte { |byte| recovered << byte }
      rescue StandardError
        # Expected: the reader raises once it runs out of input; keep whatever
        # was decoded before that.
      end
      return recovered.pack('C*')
    end
  when /\Adeflate\z/i
    debug "inflating body"
    begin
      # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread:
      # -MAX_WBITS stops zlib from looking for a zlib header
      return Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(str)
    rescue Zlib::Error => raw_error
      # Servers that follow the RFC send zlib-wrapped data, which the raw
      # inflater rejects; retry with the header-aware inflater and report the
      # original failure if that does not work either.
      begin
        return Zlib::Inflate.new.inflate(str)
      rescue Zlib::Error
        raise raw_error
      end
    end
  when /^(?:iso-8859-\d+|windows-\d+|utf-8|utf8)$/i
    # B0rked servers (Freshmeat being one of them) sometimes return the charset
    # in the content-encoding; in this case we assume that the document has
    # a standard content-encoding
    charset_param = "; charset=" + method.downcase
    ctype = self['content-type'] || 'text/html'
    # This method can be called repeatedly on the same response (once per
    # chunk from partial_body), so only patch the header the first time.
    unless ctype.include?(charset_param)
      old_hsh = self.to_hash
      self['content-type'] = ctype + charset_param
      warning "Charset vs content-encoding confusion, trying to recover: from\n#{old_hsh.pretty_inspect}to\n#{self.to_hash.pretty_inspect}"
    end
    return str
  else
    debug self.to_hash
    raise "Unhandled content encoding #{method}"
  end
end
Read chunks from the body until we have at least size bytes, yielding the partial text at each chunk. Return the partial body.
# File lib/rbot/core/utils/httputil.rb, line 132
#
# Fetch (part of) the body and return it decompressed and converted to UTF-8.
#
# size:: stop reading once this many characters have been accumulated; +nil+
#        or a non-positive value reads every chunk
#
# When a block is given it receives the decompressed, UTF-8 converted text
# accumulated so far: once per chunk while reading, or once with the whole
# body when <tt>@read</tt> is already set (in which case +body+ is used and
# +size+ is ignored). Reading chunk by chunk sets +no_cache+ to true first.
def partial_body(size=0, &block)
  buffer = String.new

  if @read
    debug "using body() as partial"
    buffer = self.body
    if block_given?
      yield self.body_to_utf(self.decompress_body(buffer))
    end
  else
    debug "disabling cache"
    self.no_cache = true
    self.read_body do |chunk|
      buffer << chunk
      if block_given?
        yield self.body_to_utf(self.decompress_body(buffer))
      end
      break if size && size > 0 && buffer.length >= size
    end
  end

  self.body_to_utf(self.decompress_body(buffer))
end
Generated with the Darkfish Rdoc Generator 2.