diff --git a/blake3.go b/blake3.go
index 5f5c917..53293ca 100644
--- a/blake3.go
+++ b/blake3.go
@@ -10,40 +10,38 @@ import (
 )
 
 const (
-	OUT_LEN   = 32
-	KEY_LEN   = 32
-	BLOCK_LEN = 64
-	CHUNK_LEN = 1024
-
-	CHUNK_START         = 1 << 0
-	CHUNK_END           = 1 << 1
-	PARENT              = 1 << 2
-	ROOT                = 1 << 3
-	KEYED_HASH          = 1 << 4
-	DERIVE_KEY_CONTEXT  = 1 << 5
-	DERIVE_KEY_MATERIAL = 1 << 6
+	blockLen = 64
+	chunkLen = 1024
 )
 
-var IV = [8]uint32{
-	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
+// Flag bits; they are OR'ed together and passed to compress as its flags argument.
+const (
+	flagChunkStart = 1 << iota
+	flagChunkEnd
+	flagParent
+	flagRoot
+	flagKeyedHash
+	flagDeriveKeyContext
+	flagDeriveKeyMaterial
+)
+
+var iv = [8]uint32{
+	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
 }
 
-var MSG_PERMUTATION = [16]uint{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}
-
-func rotate_right(x uint32, n int) uint32 {
-	return (x >> n) | (x << (32 - n))
-}
-
-// The mixing function, G, which mixes either a column or a diagonal.
 func g(state *[16]uint32, a, b, c, d int, mx, my uint32) {
+	rotr := func(x uint32, n int) uint32 {
+		return (x >> n) | (x << (32 - n))
+	}
 	state[a] = state[a] + state[b] + mx
-	state[d] = rotate_right(state[d]^state[a], 16)
+	state[d] = rotr(state[d]^state[a], 16)
 	state[c] = state[c] + state[d]
-	state[b] = rotate_right(state[b]^state[c], 12)
+	state[b] = rotr(state[b]^state[c], 12)
 	state[a] = state[a] + state[b] + my
-	state[d] = rotate_right(state[d]^state[a], 8)
+	state[d] = rotr(state[d]^state[a], 8)
 	state[c] = state[c] + state[d]
-	state[b] = rotate_right(state[b]^state[c], 7)
+	state[b] = rotr(state[b]^state[c], 7)
 }
 
 func round(state, m *[16]uint32) {
@@ -60,33 +58,20 @@ func round(state, m *[16]uint32) {
 }
 
 func permute(m *[16]uint32) {
-	var permuted [16]uint32
+	permuted := [16]uint32{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}
 	for i := range permuted {
-		permuted[i] = m[MSG_PERMUTATION[i]]
+		permuted[i] = m[permuted[i]]
 	}
 	*m = permuted
 }
 
-func compress(chaining_value *[8]uint32, block_words *[16]uint32, counter uint64, block_len uint32, flags uint32) [16]uint32 {
+func compress(cv [8]uint32, block [16]uint32, counter uint64, blockLen uint32, flags uint32) [16]uint32 {
 	state := [16]uint32{
-		chaining_value[0],
-		chaining_value[1],
-		chaining_value[2],
-		chaining_value[3],
-		chaining_value[4],
-		chaining_value[5],
-		chaining_value[6],
-		chaining_value[7],
-		IV[0],
-		IV[1],
-		IV[2],
-		IV[3],
-		uint32(counter),
-		uint32(counter >> 32),
-		block_len,
-		flags,
+		cv[0], cv[1], cv[2], cv[3],
+		cv[4], cv[5], cv[6], cv[7],
+		iv[0], iv[1], iv[2], iv[3],
+		uint32(counter), uint32(counter >> 32), blockLen, flags,
 	}
-	block := *block_words
 
 	round(&state, &block) // round 1
 	permute(&block)
@@ -102,52 +87,52 @@ func compress(chaining_value *[8]uint32, block_words *[16]uint32, counter uint64
 	permute(&block)
 	round(&state, &block) // round 7
 
-	for i := range chaining_value {
+	for i := range cv {
 		state[i] ^= state[i+8]
-		state[i+8] ^= chaining_value[i]
+		state[i+8] ^= cv[i]
 	}
 	return state
 }
 
-func first_8_words(compression_output [16]uint32) (out [8]uint32) {
-	copy(out[:], compression_output[:8])
+func first8(words [16]uint32) (out [8]uint32) {
+	copy(out[:], words[:8])
 	return
 }
 
-func words_from_litte_endian_bytes(bytes []byte, words []uint32) {
+func bytesToWords(bytes []byte, words []uint32) {
 	for i := 0; i < len(bytes); i += 4 {
 		words[i/4] = binary.LittleEndian.Uint32(bytes[i:])
 	}
 }
 
-// Each chunk or parent node can produce either an 8-word chaining value or, by
-// setting the ROOT flag, any number of final output bytes. The output struct
-// captures the state just prior to choosing between those two possibilities.
-type output struct {
-	input_chaining_value [8]uint32
-	block_words          [16]uint32
-	counter              uint64
-	block_len            uint32
-	flags                uint32
+func wordsToBlock(words []uint32, bytes []byte) {
+	for i, w := range words {
+		binary.LittleEndian.PutUint32(bytes[i*4:], w)
+	}
 }
 
-func (o *output) chaining_value() [8]uint32 {
-	return first_8_words(compress(
-		&o.input_chaining_value,
-		&o.block_words,
-		o.counter,
-		o.block_len,
-		o.flags,
-	))
+// Each chunk or parent node can produce either an 8-word chaining value or, by
+// setting flagRoot, any number of final output bytes. The output struct
+// captures the state just prior to choosing between those two possibilities.
+type output struct {
+	inChain    [8]uint32
+	blockWords [16]uint32
+	counter    uint64
+	blockLen   uint32
+	flags      uint32
+}
+
+func (o *output) chainingValue() [8]uint32 {
+	return first8(compress(o.inChain, o.blockWords, o.counter, o.blockLen, o.flags))
 }
 
 // An OutputReader produces an unbounded stream of output from its initial
 // state.
 type OutputReader struct {
-	o             *output
-	block         [BLOCK_LEN]byte
-	remaining     int
-	blocks_output uint64
+	o            *output
+	block        [blockLen]byte
+	remaining    int
+	blocksoutput uint64
 }
 
 // Read implements io.Reader. Read always return len(p), nil.
@@ -156,21 +141,19 @@ func (or *OutputReader) Read(p []byte) (int, error) {
 	for len(p) > 0 {
 		if or.remaining == 0 {
 			words := compress(
-				&or.o.input_chaining_value,
-				&or.o.block_words,
-				or.blocks_output,
-				or.o.block_len,
-				or.o.flags|ROOT,
+				or.o.inChain,
+				or.o.blockWords,
+				or.blocksoutput,
+				or.o.blockLen,
+				or.o.flags|flagRoot,
 			)
-			for i, w := range words {
-				binary.LittleEndian.PutUint32(or.block[i*4:], w)
-			}
-			or.remaining = BLOCK_LEN
-			or.blocks_output++
+			wordsToBlock(words[:], or.block[:])
+			or.remaining = blockLen
+			or.blocksoutput++
 		}
 
 		// copy from output buffer
-		n := copy(p, or.block[BLOCK_LEN-or.remaining:])
+		n := copy(p, or.block[blockLen-or.remaining:])
 		or.remaining -= n
 		p = p[n:]
 	}
@@ -178,104 +161,92 @@ func (or *OutputReader) Read(p []byte) (int, error) {
 }
 
 type chunkState struct {
-	chaining_value    [8]uint32
-	chunk_counter     uint64
-	block             [BLOCK_LEN]byte
-	block_len         byte
-	blocks_compressed byte
-	flags             uint32
-}
-
-func (cs *chunkState) len() int {
-	return BLOCK_LEN*int(cs.blocks_compressed) + int(cs.block_len)
-}
-
-func (cs *chunkState) start_flag() uint32 {
-	if cs.blocks_compressed == 0 {
-		return CHUNK_START
-	}
-	return 0
+	chainingValue [8]uint32
+	chunkCounter  uint64
+	block         [blockLen]byte
+	blockLen      int
+	bytesConsumed int
+	flags         uint32
 }
 
 func (cs *chunkState) update(input []byte) {
 	for len(input) > 0 {
 		// If the block buffer is full, compress it and clear it. More
-		// input is coming, so this compression is not CHUNK_END.
-		if cs.block_len == BLOCK_LEN {
-			var block_words [16]uint32
-			words_from_litte_endian_bytes(cs.block[:], block_words[:])
-			cs.chaining_value = first_8_words(compress(
-				&cs.chaining_value,
-				&block_words,
-				cs.chunk_counter,
-				BLOCK_LEN,
-				cs.flags|cs.start_flag(),
+		// input is coming, so this compression does not set flagChunkEnd.
+		if cs.blockLen == blockLen {
+			var blockWords [16]uint32
+			bytesToWords(cs.block[:], blockWords[:])
+			cs.chainingValue = first8(compress(
+				cs.chainingValue,
+				blockWords,
+				cs.chunkCounter,
+				blockLen,
+				cs.flags,
 			))
-			cs.blocks_compressed++
-			cs.block = [BLOCK_LEN]byte{}
-			cs.block_len = 0
+			cs.block = [blockLen]byte{}
+			cs.blockLen = 0
+			// Only the first block of a chunk carries the start flag; clear it now.
+			cs.flags &^= flagChunkStart
 		}
 
 		// Copy input bytes into the block buffer.
-		n := copy(cs.block[cs.block_len:], input)
-		cs.block_len += byte(n)
+		n := copy(cs.block[cs.blockLen:], input)
+		cs.blockLen += n
+		cs.bytesConsumed += n
 		input = input[n:]
 	}
 }
 
 func (cs *chunkState) output() *output {
-	var block_words [16]uint32
-	words_from_litte_endian_bytes(cs.block[:], block_words[:])
+	var blockWords [16]uint32
+	bytesToWords(cs.block[:], blockWords[:])
 	return &output{
-		input_chaining_value: cs.chaining_value,
-		block_words:          block_words,
-		block_len:            uint32(cs.block_len),
-		counter:              cs.chunk_counter,
-		flags:                cs.flags | cs.start_flag() | CHUNK_END,
+		inChain:    cs.chainingValue,
+		blockWords: blockWords,
+		blockLen:   uint32(cs.blockLen),
+		counter:    cs.chunkCounter,
+		flags:      cs.flags | flagChunkEnd,
 	}
 }
 
-func newChunkState(key [8]uint32, chunk_counter uint64, flags uint32) chunkState {
+func newChunkState(key [8]uint32, chunkCounter uint64, flags uint32) chunkState {
 	return chunkState{
-		chaining_value: key,
-		chunk_counter:  chunk_counter,
-		flags:          flags,
+		chainingValue: key,
+		chunkCounter:  chunkCounter,
+		// the first block of the chunk is compressed with the start flag set
+		flags: flags | flagChunkStart,
 	}
 }
 
-func parent_output(left_child_cv [8]uint32, right_child_cv [8]uint32, key [8]uint32, flags uint32) *output {
-	var block_words [16]uint32
-	copy(block_words[:8], left_child_cv[:])
-	copy(block_words[8:], right_child_cv[:])
+func parentOutput(left, right [8]uint32, key [8]uint32, flags uint32) *output {
+	var blockWords [16]uint32
+	copy(blockWords[:8], left[:])
+	copy(blockWords[8:], right[:])
 	return &output{
-		input_chaining_value: key,
-		block_words:          block_words,
-		counter:              0,         // Always 0 for parent nodes.
-		block_len:            BLOCK_LEN, // Always BLOCK_LEN (64) for parent nodes.
-		flags:                PARENT | flags,
+		inChain:    key,
+		blockWords: blockWords,
+		counter:    0,        // Always 0 for parent nodes.
+		blockLen:   blockLen, // Always blockLen (64) for parent nodes.
+		flags:      flagParent | flags,
 	}
 }
 
-func parent_cv(left_child_cv [8]uint32, right_child_cv [8]uint32, key [8]uint32, flags uint32) [8]uint32 {
-	return parent_output(left_child_cv, right_child_cv, key, flags).chaining_value()
-}
-
 // Hasher implements hash.Hash.
 type Hasher struct {
-	chunk_state  chunkState
-	key          [8]uint32
-	cv_stack     [54][8]uint32 // Space for 54 subtree chaining values:
-	cv_stack_len byte          // 2^54 * CHUNK_LEN = 2^64
-	flags        uint32
-	out_size     int
+	cs         chunkState
+	key        [8]uint32
+	chainStack [54][8]uint32 // space for 54 subtrees (2^54 * chunkLen = 2^64)
+	stackSize  int           // number of entries currently held in chainStack
+	flags      uint32
+	size       int // output size, for Sum
 }
 
-func newHasher(key [8]uint32, flags uint32, out_size int) *Hasher {
+func newHasher(key [8]uint32, flags uint32, size int) *Hasher {
 	return &Hasher{
-		chunk_state: newChunkState(key, 0, flags),
-		key:         key,
-		flags:       flags,
-		out_size:    out_size,
+		cs:    newChunkState(key, 0, flags),
+		key:   key,
+		flags: flags,
+		size:  size,
 	}
 }
 
@@ -283,90 +254,89 @@ func newHasher(key [8]uint32, flags uint32, out_size int) *Hasher {
 // is unkeyed.
 func New(size int, key []byte) *Hasher {
 	if key == nil {
-		return newHasher(IV, 0, size)
+		return newHasher(iv, 0, size)
 	}
-	var key_words [8]uint32
-	words_from_litte_endian_bytes(key[:], key_words[:])
-	return newHasher(key_words, KEYED_HASH, size)
+	var keyWords [8]uint32
+	bytesToWords(key[:], keyWords[:])
+	return newHasher(keyWords, flagKeyedHash, size)
 }
 
 // NewFromDerivedKey returns a Hasher whose key was derived from the supplied
 // context string.
 func NewFromDerivedKey(size int, ctx string) *Hasher {
-	h := newHasher(IV, DERIVE_KEY_CONTEXT, KEY_LEN)
+	const (
+		derivedKeyLen = 32
+	)
+	h := newHasher(iv, flagDeriveKeyContext, derivedKeyLen)
 	h.Write([]byte(ctx))
 	key := h.Sum(nil)
-	var key_words [8]uint32
-	words_from_litte_endian_bytes(key, key_words[:])
-	return newHasher(key_words, DERIVE_KEY_MATERIAL, size)
+	var keyWords [8]uint32
+	bytesToWords(key, keyWords[:])
+	return newHasher(keyWords, flagDeriveKeyMaterial, size)
 }
 
-func (h *Hasher) push_stack(cv [8]uint32) {
-	h.cv_stack[h.cv_stack_len] = cv
-	h.cv_stack_len++
-}
-
-func (h *Hasher) pop_stack() [8]uint32 {
-	h.cv_stack_len--
-	return h.cv_stack[h.cv_stack_len]
-}
-
-func (h *Hasher) add_chunk_chaining_value(new_cv [8]uint32, total_chunks uint64) {
+func (h *Hasher) addChunkChainingValue(cv [8]uint32, totalChunks uint64) {
 	// This chunk might complete some subtrees. For each completed subtree,
 	// its left child will be the current top entry in the CV stack, and
-	// its right child will be the current value of `new_cv`. Pop each left
-	// child off the stack, merge it with `new_cv`, and overwrite `new_cv`
+	// its right child will be the current value of `cv`. Pop each left
+	// child off the stack, merge it with `cv`, and overwrite `cv`
 	// with the result. After all these merges, push the final value of
-	// `new_cv` onto the stack. The number of completed subtrees is given
+	// `cv` onto the stack. The number of completed subtrees is given
 	// by the number of trailing 0-bits in the new total number of chunks.
-	for total_chunks&1 == 0 {
-		new_cv = parent_cv(h.pop_stack(), new_cv, h.key, h.flags)
-		total_chunks >>= 1
+	right := cv
+	for totalChunks&1 == 0 {
+		// pop
+		h.stackSize--
+		left := h.chainStack[h.stackSize]
+		// merge
+		right = parentOutput(left, right, h.key, h.flags).chainingValue()
+		totalChunks >>= 1
 	}
-	h.push_stack(new_cv)
+	h.chainStack[h.stackSize] = right
+	h.stackSize++
 }
 
 // Reset implements hash.Hash.
 func (h *Hasher) Reset() {
-	h.chunk_state = newChunkState(h.key, 0, h.flags)
-	h.cv_stack_len = 0
+	h.cs = newChunkState(h.key, 0, h.flags)
+	h.stackSize = 0
 }
 
 // BlockSize implements hash.Hash.
 func (h *Hasher) BlockSize() int { return 64 }
 
 // Size implements hash.Hash.
-func (h *Hasher) Size() int { return h.out_size }
+func (h *Hasher) Size() int { return h.size }
 
 // Write implements hash.Hash.
-func (h *Hasher) Write(input []byte) (int, error) {
-	written := len(input)
-	for len(input) > 0 {
+func (h *Hasher) Write(p []byte) (int, error) {
+	lenp := len(p)
+	for len(p) > 0 {
 		// If the current chunk is complete, finalize it and reset the
-		// chunk state. More input is coming, so this chunk is not ROOT.
-		if h.chunk_state.len() == CHUNK_LEN {
-			chunk_cv := h.chunk_state.output().chaining_value()
-			total_chunks := h.chunk_state.chunk_counter + 1
-			h.add_chunk_chaining_value(chunk_cv, total_chunks)
-			h.chunk_state = newChunkState(h.key, total_chunks, h.flags)
+		// chunk state. More input is coming, so this chunk is compressed without flagRoot.
+		if h.cs.bytesConsumed == chunkLen {
+			cv := h.cs.output().chainingValue()
+			totalChunks := h.cs.chunkCounter + 1
+			h.addChunkChainingValue(cv, totalChunks)
+			h.cs = newChunkState(h.key, totalChunks, h.flags)
 		}
 
 		// Compress input bytes into the current chunk state.
-		n := len(input)
-		if n > CHUNK_LEN-h.chunk_state.len() {
-			n = CHUNK_LEN - h.chunk_state.len()
+		n := chunkLen - h.cs.bytesConsumed
+		if n > len(p) {
+			n = len(p)
 		}
-		h.chunk_state.update(input[:n])
-		input = input[n:]
+		h.cs.update(p[:n])
+		p = p[n:]
 	}
-	return written, nil
+	return lenp, nil
 }
 
 // Sum implements hash.Hash.
-func (h *Hasher) Sum(out_slice []byte) []byte {
+func (h *Hasher) Sum(b []byte) []byte {
 	out := make([]byte, h.Size())
 	h.XOF().Read(out)
-	return append(out_slice, out...)
+	return append(b, out...)
 }
 
 // XOF returns an OutputReader initialized with the current hash state.
@@ -374,13 +344,11 @@ func (h *Hasher) XOF() *OutputReader {
 	// Starting with the output from the current chunk, compute all the
 	// parent chaining values along the right edge of the tree, until we
 	// have the root output.
-	var output = h.chunk_state.output()
-	var parent_nodes_remaining = h.cv_stack_len
-	for parent_nodes_remaining > 0 {
-		parent_nodes_remaining--
-		output = parent_output(
-			h.cv_stack[parent_nodes_remaining],
-			output.chaining_value(),
+	output := h.cs.output()
+	for i := h.stackSize - 1; i >= 0; i-- {
+		output = parentOutput(
+			h.chainStack[i],
+			output.chainingValue(),
 			h.key,
 			h.flags,
 		)
diff --git a/blake3_test.go b/blake3_test.go
index 343dbca..2079fbf 100644
--- a/blake3_test.go
+++ b/blake3_test.go
@@ -11,17 +11,7 @@ import (
 	"lukechampine.com/blake3"
 )
 
-func toHex(data []byte) string {
-	return hex.EncodeToString(data)
-}
-
-func fromHex(s string) []byte {
-	data, err := hex.DecodeString(s)
-	if err != nil {
-		panic(err)
-	}
-	return data
-}
+func toHex(data []byte) string { return hex.EncodeToString(data) }
 
 func TestVectors(t *testing.T) {
 	data, err := ioutil.ReadFile("testdata/vectors.json")
@@ -89,7 +79,7 @@ func BenchmarkWrite(b *testing.B) {
 
 func BenchmarkChunk(b *testing.B) {
 	h := blake3.New(32, nil)
-	buf := make([]byte, blake3.CHUNK_LEN)
+	buf := make([]byte, 1024)
 	out := make([]byte, 32)
 	for i := 0; i < b.N; i++ {
 		h.Write(buf)