/*
 * The BLAKE3/BLAKE2s quarter-round mixing function G.
 * Mixes two message words (x, y) into four state words selected by
 * the indices a, b, c, d, using the BLAKE3 rotation constants
 * 16, 12, 8, 7 (see the BLAKE3 specification, section 2.2).
 */
static void g(uint32_t *state,
              size_t a,
              size_t b,
              size_t c,
              size_t d,
              uint32_t x,
              uint32_t y)
{
    state[a] = state[a] + state[b] + x;
    state[d] = rotr32(state[d] ^ state[a], 16);
    state[c] = state[c] + state[d];
    state[b] = rotr32(state[b] ^ state[c], 12);
    state[a] = state[a] + state[b] + y;
    state[d] = rotr32(state[d] ^ state[a], 8);
    state[c] = state[c] + state[d];
    state[b] = rotr32(state[b] ^ state[c], 7);
}
30 static void round_fn(uint32_t state[16],
const uint32_t *msg,
size_t round)
33 const uint8_t *schedule = MSG_SCHEDULE[round];
36 g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
37 g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
38 g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
39 g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
42 g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
43 g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
44 g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
45 g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
49 static void compress_pre(uint32_t *state,
56 uint32_t block_words[16];
59 for (
int i = 0; i < 16; i++) {
60 block_words[i] = load32(block + 4 * i);
76 state[12] = (uint32_t) counter;
77 state[13] = (uint32_t) (counter >> 32);
78 state[14] = (uint32_t) block_len;
79 state[15] = (uint32_t) flags;
81 for (round = 0; round < 7; round++) {
82 round_fn(state, &block_words[0], round);
/*
 * Compress one 64-byte block and overwrite the 8-word chaining value in
 * place: cv[i] = state[i] ^ state[i + 8] (standard BLAKE3 output folding).
 *
 * NOTE(review): parameter types reconstructed from the call sites
 * (block_len comes from a uint8_t buffer_len field) — confirm.
 */
static void blake3_compress_in_place(uint32_t *cv,
                                     const uint8_t *block,
                                     uint8_t block_len,
                                     uint64_t counter,
                                     uint8_t flags)
{
    uint32_t state[16];

    compress_pre(&state[0], cv, block, block_len, counter, flags);
    cv[0] = state[0] ^ state[8];
    cv[1] = state[1] ^ state[9];
    cv[2] = state[2] ^ state[10];
    cv[3] = state[3] ^ state[11];
    cv[4] = state[4] ^ state[12];
    cv[5] = state[5] ^ state[13];
    cv[6] = state[6] ^ state[14];
    cv[7] = state[7] ^ state[15];
}
/*
 * Compress one block in XOF (extendable-output) mode, producing a full
 * 64-byte output block:
 *   out[0..31]  = state[i] ^ state[i + 8]        (same as in-place folding)
 *   out[32..63] = state[i + 8] ^ cv[i]           (second half, feeds XOF)
 * The input chaining value cv is NOT modified, so the caller can re-invoke
 * with successive output_block_counter values to stream arbitrary output.
 */
static void blake3_compress_xof(const uint32_t *cv,
                                const uint8_t *block,
                                uint8_t block_len,
                                uint64_t counter,
                                uint8_t flags,
                                uint8_t *out)
{
    uint32_t state[16];

    compress_pre(state, cv, block, block_len, counter, flags);

    store32(&out[0 * 4], state[0] ^ state[8]);
    store32(&out[1 * 4], state[1] ^ state[9]);
    store32(&out[2 * 4], state[2] ^ state[10]);
    store32(&out[3 * 4], state[3] ^ state[11]);
    store32(&out[4 * 4], state[4] ^ state[12]);
    store32(&out[5 * 4], state[5] ^ state[13]);
    store32(&out[6 * 4], state[6] ^ state[14]);
    store32(&out[7 * 4], state[7] ^ state[15]);
    store32(&out[8 * 4], state[8] ^ cv[0]);
    store32(&out[9 * 4], state[9] ^ cv[1]);
    store32(&out[10 * 4], state[10] ^ cv[2]);
    store32(&out[11 * 4], state[11] ^ cv[3]);
    store32(&out[12 * 4], state[12] ^ cv[4]);
    store32(&out[13 * 4], state[13] ^ cv[5]);
    store32(&out[14 * 4], state[14] ^ cv[6]);
    store32(&out[15 * 4], state[15] ^ cv[7]);
}
178 static size_t blake3_fill_buffer(cx_blake3_state_t *chunk_state,
182 size_t nb_bytes = BLAKE3_BLOCK_LEN - ((size_t) chunk_state->buffer_len);
183 if (nb_bytes > input_len) {
184 nb_bytes = input_len;
186 memcpy(chunk_state->buffer + ((
size_t) chunk_state->buffer_len), input, nb_bytes);
187 chunk_state->buffer_len += (
uint8_t) nb_bytes;
192 void blake3_state_init(cx_blake3_state_t *chunk_state,
const uint32_t *key,
uint8_t flags)
194 memcpy(chunk_state->cv, key, BLAKE3_KEY_LEN);
196 memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
197 chunk_state->buffer_len = 0;
198 chunk_state->blocks_compressed = 0;
199 chunk_state->d = flags;
202 void blake3_state_update(cx_blake3_state_t *chunk_state,
const uint8_t *input,
size_t input_len)
206 if (chunk_state->buffer_len > 0) {
207 nb_bytes = blake3_fill_buffer(chunk_state, input, input_len);
209 input_len -= nb_bytes;
211 if (!chunk_state->blocks_compressed) {
212 is_start_flag = CHUNK_START;
214 blake3_compress_in_place(chunk_state->cv,
218 chunk_state->d | is_start_flag);
219 chunk_state->blocks_compressed += 1;
220 chunk_state->buffer_len = 0;
221 memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
225 while (input_len > BLAKE3_BLOCK_LEN) {
226 if (!chunk_state->blocks_compressed) {
227 is_start_flag = CHUNK_START;
232 blake3_compress_in_place(chunk_state->cv,
236 chunk_state->d | is_start_flag);
237 chunk_state->blocks_compressed += 1;
238 input += BLAKE3_BLOCK_LEN;
239 input_len -= BLAKE3_BLOCK_LEN;
242 nb_bytes = blake3_fill_buffer(chunk_state, input, input_len);
245 void blake3_state_reset(cx_blake3_state_t *chunk_state,
const uint32_t *key, uint64_t chunk_counter)
247 memcpy(chunk_state->cv, key, BLAKE3_KEY_LEN);
248 chunk_state->t = chunk_counter;
249 chunk_state->blocks_compressed = 0;
250 memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
251 chunk_state->buffer_len = 0;
254 cx_blake3_state_out_t blake3_state_output(
const cx_blake3_state_t *chunk_state)
258 cx_blake3_state_out_t chunk_output;
260 if (!chunk_state->blocks_compressed) {
261 is_start_flag = CHUNK_START;
263 block_flags = chunk_state->d | is_start_flag | CHUNK_END;
264 memcpy(chunk_output.input_cv, chunk_state->cv, BLAKE3_OUT_LEN);
265 memcpy(chunk_output.block, chunk_state->buffer, BLAKE3_BLOCK_LEN);
266 chunk_output.block_len = chunk_state->buffer_len;
267 chunk_output.counter = chunk_state->t;
268 chunk_output.d = block_flags;
273 void blake3_output_chain(
const cx_blake3_state_out_t *out,
uint8_t *cv)
275 uint32_t cv_words[BLAKE3_NB_OF_WORDS];
276 memcpy(cv_words, out->input_cv, BLAKE3_WORD_SIZE);
277 blake3_compress_in_place(cv_words, out->block, out->block_len, out->counter, out->d);
278 store_cv_words(cv, cv_words);
303 static void blake3_hash_one(
const uint8_t *input,
312 uint32_t cv[BLAKE3_NB_OF_WORDS];
313 memcpy(cv, key, BLAKE3_KEY_LEN);
314 uint8_t block_flags = flags | flags_start;
317 block_flags |= flags_end;
319 blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
320 input = &input[BLAKE3_BLOCK_LEN];
324 store_cv_words(out, cv);
350 static void blake3_hash_many(
const uint8_t *
const *inputs,
355 bool increment_counter,
361 while (num_inputs > 0) {
362 blake3_hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
363 if (increment_counter) {
368 out = &out[BLAKE3_OUT_LEN];
392 static size_t blake3_compress_chunks(
const uint8_t *input,
395 uint64_t chunk_counter,
399 const uint8_t *chunks_array[1];
400 size_t input_position = 0;
401 size_t chunks_array_len = 0;
402 cx_blake3_state_out_t output;
404 while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
405 chunks_array[chunks_array_len] = &input[input_position];
406 input_position += BLAKE3_CHUNK_LEN;
407 chunks_array_len += 1;
410 blake3_hash_many(chunks_array,
412 BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
423 if (input_len > input_position) {
424 uint64_t counter = chunk_counter + (uint64_t) chunks_array_len;
425 cx_blake3_state_t chunk_state;
426 blake3_state_init(&chunk_state, key, flags);
427 chunk_state.t = counter;
428 blake3_state_update(&chunk_state, input + input_position, input_len - input_position);
429 output = blake3_state_output(&chunk_state);
430 blake3_output_chain(&output, out + chunks_array_len * BLAKE3_OUT_LEN);
431 return chunks_array_len + 1;
434 return chunks_array_len;
455 static size_t blake3_compress_parents(
const uint8_t *child_chaining_values,
456 size_t num_chaining_values,
461 const uint8_t *parents_array[2];
462 size_t parents_array_len = 0;
464 while (num_chaining_values - (2 * parents_array_len) >= 2) {
465 parents_array[parents_array_len]
466 = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
467 parents_array_len += 1;
470 blake3_hash_many(parents_array,
482 if (num_chaining_values > 2 * parents_array_len) {
483 memcpy(out + parents_array_len * BLAKE3_OUT_LEN,
484 child_chaining_values + 2 * parents_array_len * BLAKE3_OUT_LEN,
486 return parents_array_len + 1;
489 return parents_array_len;
510 static size_t blake3_compress_subtree(
const uint8_t *input,
513 uint64_t chunk_counter,
518 if (input_len <= BLAKE3_CHUNK_LEN) {
519 return blake3_compress_chunks(input, input_len, key, chunk_counter, flags, out);
525 size_t left_input_len = 1ULL << highest_one(((input_len - 1) / BLAKE3_CHUNK_LEN) | 1);
526 left_input_len = left_input_len * BLAKE3_CHUNK_LEN;
527 size_t right_input_len = input_len - left_input_len;
528 const uint8_t *right_input = &input[left_input_len];
529 uint64_t right_chunk_counter = chunk_counter + (uint64_t) (left_input_len / BLAKE3_CHUNK_LEN);
531 uint8_t cv_array[2 * 2 * BLAKE3_OUT_LEN];
534 size_t left_n, right_n;
536 if (left_input_len > BLAKE3_CHUNK_LEN) {
539 right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
541 left_n = blake3_compress_subtree(input, left_input_len, key, chunk_counter, flags, cv_array);
542 right_n = blake3_compress_subtree(
543 right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
546 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
551 size_t num_chaining_values = left_n + right_n;
552 return blake3_compress_parents(cv_array, num_chaining_values, key, flags, out);
555 void blake3_compress_subtree_to_parent(
const uint8_t *input,
558 uint64_t chunk_counter,
562 uint8_t cv_array[2 * BLAKE3_OUT_LEN];
563 size_t num_cvs = blake3_compress_subtree(input, input_len, key, chunk_counter, flags, cv_array);
565 uint8_t out_array[BLAKE3_OUT_LEN];
566 while (num_cvs > 2) {
567 num_cvs = blake3_compress_parents(cv_array, num_cvs, key, flags, out_array);
568 memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
570 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
573 void blake3_hasher_merge_cv(cx_blake3_t *hash, uint64_t total_len)
575 size_t post_merge_stack_len = (size_t) hw(total_len);
576 cx_blake3_state_out_t output;
579 while (hash->cv_stack_len > post_merge_stack_len) {
580 parent_node = hash->cv_stack + (hash->cv_stack_len - 2) * BLAKE3_OUT_LEN;
581 memcpy(output.input_cv, hash->key, BLAKE3_OUT_LEN);
582 memcpy(output.block, parent_node, BLAKE3_BLOCK_LEN);
583 output.block_len = BLAKE3_BLOCK_LEN;
585 output.d = (hash->chunk).d | PARENT;
586 blake3_output_chain(&output, parent_node);
587 hash->cv_stack_len -= 1;
591 void blake3_hasher_push_cv(cx_blake3_t *hash,
uint8_t *new_cv, uint64_t chunk_counter)
593 blake3_hasher_merge_cv(hash, chunk_counter);
594 memcpy(hash->cv_stack + hash->cv_stack_len * BLAKE3_OUT_LEN, new_cv, BLAKE3_OUT_LEN);
595 hash->cv_stack_len += 1;
598 void blake3_output_root_bytes(
const cx_blake3_state_out_t *chunk_out,
uint8_t *out,
size_t out_len)
600 uint64_t output_block_counter = 0;
601 size_t offset_within_block = 0;
602 uint8_t wide_buf[BLAKE3_BLOCK_LEN];
604 while (out_len > 0) {
605 blake3_compress_xof(chunk_out->input_cv,
607 chunk_out->block_len,
608 output_block_counter,
611 size_t available_bytes = BLAKE3_BLOCK_LEN - offset_within_block;
613 if (out_len > available_bytes) {
614 memcpy_len = available_bytes;
617 memcpy_len = out_len;
619 memcpy(out, wide_buf + offset_within_block, memcpy_len);
621 out_len -= memcpy_len;
622 output_block_counter += 1;
623 offset_within_block = 0;
627 void blake3_init_ctx(cx_blake3_t *hash,
const uint32_t *key,
uint8_t mode)
629 memcpy(hash->key, key, BLAKE3_KEY_LEN);
630 blake3_state_init(&hash->chunk, key, mode);
631 hash->cv_stack_len = 0;
632 hash->is_init =
true;