Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clean up and export crc32c function #22274

Merged
merged 14 commits into from
Jun 13, 2017
Prev Previous commit
Next Next commit
restore crc32 of String, add crc32c(io) to read all of a stream, add …
…optimized open(crc32c, filename), make IOBuffer checksums consistent with other streams
  • Loading branch information
stevengj committed Jun 8, 2017
commit 38f6fc04c0fd932b0cfd8f575b0ef02b810e4368
42 changes: 26 additions & 16 deletions base/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -769,17 +769,22 @@ end
crc32c(data, crc::UInt32=0x00000000)

Compute the CRC-32c checksum of the given `data`, which can be
an `Array{UInt8}`, a contiguous subarray thereof, or an `IOBuffer`, or
a filename (whose contents will be checksummed). Optionally, you can pass
an `Array{UInt8}`, a contiguous subarray thereof, or a `String`. Optionally, you can pass
a starting `crc` integer to be mixed in with the checksum. The `crc` parameter
can be used to compute a checksum on data divided into chunks: performing
`crc32c(data2, crc32c(data1))` is equivalent to the checksum of `[data1; data2]`.
(Technically, a little-endian checksum is computed.)

To checksum `s::String`, you can do `crc32c(Vector{UInt8}(s))`; note
that the result is specific to the UTF-8 encoding of `String`. To checksum
an `a::Array` of some other bitstype, you can do `crc32c(reinterpret(UInt8,a))`;
note that the result is endian-dependent.
There is also a method `crc32c(io, nb, crc)` to checksum `nb` bytes from
a stream `io`, or `crc32c(io, crc)` to checksum all the remaining bytes.
Hence you can do [`open(crc32c, filename)`](@ref) to checksum an entire file,
or `crc32c(seekstart(buf))` to checksum an [`IOBuffer`](@ref) without
calling [`take!`](@ref).

For a `String`, note that the result is specific to the UTF-8 encoding
(a different checksum would be obtained from a different Unicode encoding).
To checksum an `a::Array` of some other bitstype, you can do `crc32c(reinterpret(UInt8,a))`,
but note that the result may be endian-dependent.
"""
function crc32c end

Expand All @@ -788,28 +793,33 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_
crc32c(a::Union{Array{UInt8},FastContiguousSubArray{UInt8,N,<:Array{UInt8}} where N}, crc::UInt32=0x00000000) =
unsafe_crc32c(a, length(a), crc)

crc32c(buf::IOBuffer, crc::UInt32=0x00000000) = unsafe_crc32c(buf.data, min(buf.size, length(buf.data)), crc)
crc32c(s::String, crc::UInt32=0x00000000) = unsafe_crc32c(s, sizeof(s), crc)

"""
crc32c(f::IO, nb::Integer, crc::UInt32=0x00000000)
crc32c(io::IO, [nb::Integer,] crc::UInt32=0x00000000)

Read up to `nb` bytes from `f` and return the CRC-32c checksum, optionally
mixed with a starting `crc` integer.
Read up to `nb` bytes from `io` and return the CRC-32c checksum, optionally
mixed with a starting `crc` integer. If `nb` is not supplied, then
`io` will be read until the end of the stream.
"""
function crc32c(f::IO, nb::Integer, crc::UInt32=0x00000000)
function crc32c(io::IO, nb::Integer, crc::UInt32=0x00000000)
nb < 0 && throw(ArgumentError("number of bytes to checksum must be ≥ 0"))
buf = Array{UInt8}(min(nb, 16384))
while !eof(f) && nb > 16384
n = readbytes!(f, buf)
while !eof(io) && nb > 16384
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be 8192 * 3? That's the LONG block size used in the SSE4.2 version (and also on ARM in one of my upcoming changes).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. I tried 16384 and 32768 and the latter wasn't any faster on my machine, but 8192 * 3 is fine too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'll be caught by the short version, so it won't matter too much, but in principle 8192 * 2 and 8192 * 4 are equally bad, since neither of them makes full use of the LONG loop.

n = readbytes!(io, buf)
crc = unsafe_crc32c(buf, n, crc)
nb -= n
end
eof(io) && return crc
@assert 0 ≤ nb ≤ length(buf)
return unsafe_crc32c(buf, readbytes!(f, buf, nb), crc)
return unsafe_crc32c(buf, readbytes!(io, buf, nb), crc)
end
crc32c(io::IO, crc::UInt32=0x00000000) = crc32c(io, typemax(Int64), crc)

crc32c(filename::AbstractString, crc::UInt32=0x00000000) =
# optimization for `open(crc32c, filename)` to use the size of the file
open(::typeof(crc32c), filename::AbstractString) =
open(filename, "r") do f
crc32c(f, filesize(f), crc)
crc32c(f, filesize(f))
end


Expand Down
14 changes: 9 additions & 5 deletions test/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ end
for force_software_crc in (1,0)
ccall(:jl_crc32c_init, Void, (Cint,), force_software_crc)
for (n,crc) in [(0,0x00000000),(1,0xa016d052),(2,0x03f89f52),(3,0xf130f21e),(4,0x29308cf4),(5,0x53518fab),(6,0x4f4dfbab),(7,0xbd3a64dc),(8,0x46891f81),(9,0x5a14b9f9),(10,0xb219db69),(11,0xd232a91f),(12,0x51a15563),(13,0x9f92de41),(14,0x4d8ae017),(15,0xc8b74611),(16,0xa0de6714),(17,0x672c992a),(18,0xe8206eb6),(19,0xc52fd285),(20,0x327b0397),(21,0x318263dd),(22,0x08485ccd),(23,0xea44d29e),(24,0xf6c0cb13),(25,0x3969bba2),(26,0x6a8810ec),(27,0x75b3d0df),(28,0x82d535b1),(29,0xbdf7fc12),(30,0x1f836b7d),(31,0xd29f33af),(32,0x8e4acb3e),(33,0x1cbee2d1),(34,0xb25f7132),(35,0xb0fa484c),(36,0xb9d262b4),(37,0x3207fe27),(38,0xa024d7ac),(39,0x49a2e7c5),(40,0x0e2c157f),(41,0x25f7427f),(42,0x368c6adc),(43,0x75efd4a5),(44,0xa84c5c31),(45,0x0fc817b2),(46,0x8d99a881),(47,0x5cc3c078),(48,0x9983d5e2),(49,0x9267c2db),(50,0xc96d4745),(51,0x058d8df3),(52,0x453f9cf3),(53,0xb714ade1),(54,0x55d3c2bc),(55,0x495710d0),(56,0x3bddf494),(57,0x4f2577d0),(58,0xdae0f604),(59,0x3c57c632),(60,0xfe39bbb0),(61,0x6f5d1d41),(62,0x7d996665),(63,0x68c738dc),(64,0x8dfea7ae)]
@test crc32c(UInt8[1:n;]) == crc
@test crc32c(UInt8[1:n;]) == crc == crc32c(String(UInt8[1:n;]))
end
# test that crc parameter is equivalent to checksum of concatenated data,
# and test crc of subarrays:
Expand All @@ -579,22 +579,26 @@ for force_software_crc in (1,0)
@test crc32c(IOBuffer(a)) == crc_256
let buf = IOBuffer()
write(buf, a[1:3])
@test crc32c(buf) == crc32c(a[1:3])
@test crc32c(seekstart(buf)) == crc32c(a[1:3])
@test crc32c(buf) == 0x00000000
@test crc32c(seek(buf, 1)) == crc32c(a[2:3])
@test crc32c(seek(buf, 0), 2) == crc32c(a[1:2])
@test crc32c(buf) == crc32c(a[3:3])
end

let f = tempname()
try
write(f, a)
@test crc32c(f) == crc_256
@test open(crc32c, f) == crc_256
open(f, "r") do io
@test crc32c(io, 16) == crc32c(a[1:16])
@test crc32c(io, 16) == crc32c(a[17:32])
@test crc32c(io, 1000) == crc32c(a[33:end])
@test crc32c(io) == crc32c(a[33:end])
@test crc32c(io, 1000) == 0x00000000
end
a = rand(UInt8, 30000)
write(f, a)
@test crc32c(f) == crc32c(a)
@test open(crc32c, f) == crc32c(a) == open(io -> crc32c(io, 10^6), f)
finally
rm(f, force=true)
end
Expand Down