Embedded SDK
Embedded SDK
Loading...
Searching...
No Matches
cx_blake3_ref.c
Go to the documentation of this file.
1#ifdef HAVE_BLAKE3
2
3#include "cx_blake3_ref.h"
4#include "cx_utils.h"
5#include "cx_ram.h"
6
7#include <string.h>
8
9/* ---------------------------------------------------------------------------------------------------*/
10/* Functions from the reference implementation */
11/* ---------------------------------------------------------------------------------------------------*/
12
13/********************* Compression function ************************/
14
15// The 'quarter-round' function G_i(a,b,c,d)
16static void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y)
17{
18 state[a] = state[a] + state[b] + x;
19 state[d] = rotr32(state[d] ^ state[a], 16);
20 state[c] = state[c] + state[d];
21 state[b] = rotr32(state[b] ^ state[c], 12);
22 state[a] = state[a] + state[b] + y;
23 state[d] = rotr32(state[d] ^ state[a], 8);
24 state[c] = state[c] + state[d];
25 state[b] = rotr32(state[b] ^ state[c], 7);
26}
27
28// The round function F = |G_0(0,4,8,12) G_1(1,5,9,13) G_2(2,6,10,14) G_3(3,7,11,15)|
29// |G_4(0,5,10,15) G_5(1,6,11,12) G_6(2,7,8,13) G_7(3,4,9,14) |
30static void round_fn(uint32_t state[16], const uint32_t *msg, size_t round)
31{
32 // Select the message schedule based on the round.
33 const uint8_t *schedule = MSG_SCHEDULE[round];
34
35 // Mix the columns.
36 g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
37 g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
38 g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
39 g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
40
41 // Mix the rows.
42 g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
43 g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
44 g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
45 g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
46}
47
48// Compressing a block along with a chaining value
49static void compress_pre(uint32_t *state,
50 const uint32_t *cv,
51 const uint8_t *block,
52 uint8_t block_len,
53 uint64_t counter,
54 uint8_t flags)
55{
56 uint32_t block_words[16];
57 size_t round;
58 // First convert the block into 32-byte words
59 for (int i = 0; i < 16; i++) {
60 block_words[i] = load32(block + 4 * i);
61 }
62
63 // Initialize the state
64 state[0] = cv[0];
65 state[1] = cv[1];
66 state[2] = cv[2];
67 state[3] = cv[3];
68 state[4] = cv[4];
69 state[5] = cv[5];
70 state[6] = cv[6];
71 state[7] = cv[7];
72 state[8] = IV[0];
73 state[9] = IV[1];
74 state[10] = IV[2];
75 state[11] = IV[3];
76 state[12] = (uint32_t) counter;
77 state[13] = (uint32_t) (counter >> 32);
78 state[14] = (uint32_t) block_len;
79 state[15] = (uint32_t) flags;
80
81 for (round = 0; round < 7; round++) {
82 round_fn(state, &block_words[0], round);
83 }
84}
85
103static void blake3_compress_in_place(uint32_t *cv,
104 const uint8_t *block,
105 uint8_t block_len,
106 uint64_t counter,
107 uint8_t flags)
108{
109 uint32_t state[16];
110 compress_pre(&state[0], cv, block, block_len, counter, flags);
111 cv[0] = state[0] ^ state[8];
112 cv[1] = state[1] ^ state[9];
113 cv[2] = state[2] ^ state[10];
114 cv[3] = state[3] ^ state[11];
115 cv[4] = state[4] ^ state[12];
116 cv[5] = state[5] ^ state[13];
117 cv[6] = state[6] ^ state[14];
118 cv[7] = state[7] ^ state[15];
119}
120
138static void blake3_compress_xof(const uint32_t *cv,
139 const uint8_t *block,
140 uint8_t block_len,
141 uint64_t counter,
142 uint8_t flags,
143 uint8_t *out)
144{
145 uint32_t state[16];
146 compress_pre(state, cv, block, block_len, counter, flags);
147
148 store32(&out[0 * 4], state[0] ^ state[8]);
149 store32(&out[1 * 4], state[1] ^ state[9]);
150 store32(&out[2 * 4], state[2] ^ state[10]);
151 store32(&out[3 * 4], state[3] ^ state[11]);
152 store32(&out[4 * 4], state[4] ^ state[12]);
153 store32(&out[5 * 4], state[5] ^ state[13]);
154 store32(&out[6 * 4], state[6] ^ state[14]);
155 store32(&out[7 * 4], state[7] ^ state[15]);
156 store32(&out[8 * 4], state[8] ^ cv[0]);
157 store32(&out[9 * 4], state[9] ^ cv[1]);
158 store32(&out[10 * 4], state[10] ^ cv[2]);
159 store32(&out[11 * 4], state[11] ^ cv[3]);
160 store32(&out[12 * 4], state[12] ^ cv[4]);
161 store32(&out[13 * 4], state[13] ^ cv[5]);
162 store32(&out[14 * 4], state[14] ^ cv[6]);
163 store32(&out[15 * 4], state[15] ^ cv[7]);
164}
165
166/******************** Chunk state functions *********************/
178static size_t blake3_fill_buffer(cx_blake3_state_t *chunk_state,
179 const uint8_t *input,
180 size_t input_len)
181{
182 size_t nb_bytes = BLAKE3_BLOCK_LEN - ((size_t) chunk_state->buffer_len);
183 if (nb_bytes > input_len) {
184 nb_bytes = input_len;
185 }
186 memcpy(chunk_state->buffer + ((size_t) chunk_state->buffer_len), input, nb_bytes);
187 chunk_state->buffer_len += (uint8_t) nb_bytes;
188
189 return nb_bytes;
190}
191
192void blake3_state_init(cx_blake3_state_t *chunk_state, const uint32_t *key, uint8_t flags)
193{
194 memcpy(chunk_state->cv, key, BLAKE3_KEY_LEN);
195 chunk_state->t = 0;
196 memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
197 chunk_state->buffer_len = 0;
198 chunk_state->blocks_compressed = 0;
199 chunk_state->d = flags;
200}
201
202void blake3_state_update(cx_blake3_state_t *chunk_state, const uint8_t *input, size_t input_len)
203{
204 size_t nb_bytes;
205 uint8_t is_start_flag = 0;
206 if (chunk_state->buffer_len > 0) {
207 nb_bytes = blake3_fill_buffer(chunk_state, input, input_len);
208 input += nb_bytes;
209 input_len -= nb_bytes;
210 if (input_len > 0) {
211 if (!chunk_state->blocks_compressed) {
212 is_start_flag = CHUNK_START;
213 }
214 blake3_compress_in_place(chunk_state->cv,
215 chunk_state->buffer,
216 BLAKE3_BLOCK_LEN,
217 chunk_state->t,
218 chunk_state->d | is_start_flag);
219 chunk_state->blocks_compressed += 1;
220 chunk_state->buffer_len = 0;
221 memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
222 }
223 }
224
225 while (input_len > BLAKE3_BLOCK_LEN) {
226 if (!chunk_state->blocks_compressed) {
227 is_start_flag = CHUNK_START;
228 }
229 else {
230 is_start_flag = 0;
231 }
232 blake3_compress_in_place(chunk_state->cv,
233 input,
234 BLAKE3_BLOCK_LEN,
235 chunk_state->t,
236 chunk_state->d | is_start_flag);
237 chunk_state->blocks_compressed += 1;
238 input += BLAKE3_BLOCK_LEN;
239 input_len -= BLAKE3_BLOCK_LEN;
240 }
241
242 nb_bytes = blake3_fill_buffer(chunk_state, input, input_len);
243}
244
245void blake3_state_reset(cx_blake3_state_t *chunk_state, const uint32_t *key, uint64_t chunk_counter)
246{
247 memcpy(chunk_state->cv, key, BLAKE3_KEY_LEN);
248 chunk_state->t = chunk_counter;
249 chunk_state->blocks_compressed = 0;
250 memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
251 chunk_state->buffer_len = 0;
252}
253
254cx_blake3_state_out_t blake3_state_output(const cx_blake3_state_t *chunk_state)
255{
256 uint8_t is_start_flag = 0;
257 uint8_t block_flags;
258 cx_blake3_state_out_t chunk_output;
259
260 if (!chunk_state->blocks_compressed) {
261 is_start_flag = CHUNK_START;
262 }
263 block_flags = chunk_state->d | is_start_flag | CHUNK_END;
264 memcpy(chunk_output.input_cv, chunk_state->cv, BLAKE3_OUT_LEN);
265 memcpy(chunk_output.block, chunk_state->buffer, BLAKE3_BLOCK_LEN);
266 chunk_output.block_len = chunk_state->buffer_len;
267 chunk_output.counter = chunk_state->t;
268 chunk_output.d = block_flags;
269
270 return chunk_output;
271}
272
273void blake3_output_chain(const cx_blake3_state_out_t *out, uint8_t *cv)
274{
275 uint32_t cv_words[BLAKE3_NB_OF_WORDS];
276 memcpy(cv_words, out->input_cv, BLAKE3_WORD_SIZE);
277 blake3_compress_in_place(cv_words, out->block, out->block_len, out->counter, out->d);
278 store_cv_words(cv, cv_words);
279}
280
281/********************* Hashing ************************/
303static void blake3_hash_one(const uint8_t *input,
304 size_t blocks,
305 const uint32_t *key,
306 uint64_t counter,
307 uint8_t flags,
308 uint8_t flags_start,
309 uint8_t flags_end,
310 uint8_t *out)
311{
312 uint32_t cv[BLAKE3_NB_OF_WORDS];
313 memcpy(cv, key, BLAKE3_KEY_LEN);
314 uint8_t block_flags = flags | flags_start;
315 while (blocks > 0) {
316 if (1 == blocks) {
317 block_flags |= flags_end;
318 }
319 blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
320 input = &input[BLAKE3_BLOCK_LEN];
321 blocks -= 1;
322 block_flags = flags;
323 }
324 store_cv_words(out, cv);
325}
326
350static void blake3_hash_many(const uint8_t *const *inputs,
351 size_t num_inputs,
352 size_t blocks,
353 const uint32_t *key,
354 uint64_t counter,
355 bool increment_counter,
356 uint8_t flags,
357 uint8_t flags_start,
358 uint8_t flags_end,
359 uint8_t *out)
360{
361 while (num_inputs > 0) {
362 blake3_hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
363 if (increment_counter) {
364 counter += 1;
365 }
366 inputs += 1;
367 num_inputs -= 1;
368 out = &out[BLAKE3_OUT_LEN];
369 }
370}
371
392static size_t blake3_compress_chunks(const uint8_t *input,
393 size_t input_len,
394 const uint32_t *key,
395 uint64_t chunk_counter,
396 uint8_t flags,
397 uint8_t *out)
398{
399 const uint8_t *chunks_array[1];
400 size_t input_position = 0;
401 size_t chunks_array_len = 0;
402 cx_blake3_state_out_t output;
403
404 while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
405 chunks_array[chunks_array_len] = &input[input_position];
406 input_position += BLAKE3_CHUNK_LEN;
407 chunks_array_len += 1;
408 }
409
410 blake3_hash_many(chunks_array,
411 chunks_array_len,
412 BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
413 key,
414 chunk_counter,
415 true,
416 flags,
417 CHUNK_START,
418 CHUNK_END,
419 out);
420
421 // Hash the remaining partial chunk, if there is one. Note that the empty
422 // chunk (meaning the empty message) is a different codepath.
423 if (input_len > input_position) {
424 uint64_t counter = chunk_counter + (uint64_t) chunks_array_len;
425 cx_blake3_state_t chunk_state;
426 blake3_state_init(&chunk_state, key, flags);
427 chunk_state.t = counter;
428 blake3_state_update(&chunk_state, input + input_position, input_len - input_position);
429 output = blake3_state_output(&chunk_state);
430 blake3_output_chain(&output, out + chunks_array_len * BLAKE3_OUT_LEN);
431 return chunks_array_len + 1;
432 }
433 else {
434 return chunks_array_len;
435 }
436}
437
455static size_t blake3_compress_parents(const uint8_t *child_chaining_values,
456 size_t num_chaining_values,
457 const uint32_t *key,
458 uint8_t flags,
459 uint8_t *out)
460{
461 const uint8_t *parents_array[2];
462 size_t parents_array_len = 0;
463
464 while (num_chaining_values - (2 * parents_array_len) >= 2) {
465 parents_array[parents_array_len]
466 = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
467 parents_array_len += 1;
468 }
469
470 blake3_hash_many(parents_array,
471 parents_array_len,
472 1,
473 key,
474 0, // Parents always use counter 0.
475 false,
476 flags | PARENT,
477 0, // Parents have no start flags.
478 0, // Parents have no end flags.
479 out);
480
481 // If there's an odd child left over, it becomes an output.
482 if (num_chaining_values > 2 * parents_array_len) {
483 memcpy(out + parents_array_len * BLAKE3_OUT_LEN,
484 child_chaining_values + 2 * parents_array_len * BLAKE3_OUT_LEN,
485 BLAKE3_OUT_LEN);
486 return parents_array_len + 1;
487 }
488 else {
489 return parents_array_len;
490 }
491}
492
510static size_t blake3_compress_subtree(const uint8_t *input,
511 size_t input_len,
512 const uint32_t *key,
513 uint64_t chunk_counter,
514 uint8_t flags,
515 uint8_t *out)
516{
517 // One chunk
518 if (input_len <= BLAKE3_CHUNK_LEN) {
519 return blake3_compress_chunks(input, input_len, key, chunk_counter, flags, out);
520 }
521
522 // At least two chunks
523 // The left subtree consists of the first 2^(10+log((n-1)/1024)), where n is the input length
524 // and he right substree consists of the remainder
525 size_t left_input_len = 1ULL << highest_one(((input_len - 1) / BLAKE3_CHUNK_LEN) | 1);
526 left_input_len = left_input_len * BLAKE3_CHUNK_LEN;
527 size_t right_input_len = input_len - left_input_len;
528 const uint8_t *right_input = &input[left_input_len];
529 uint64_t right_chunk_counter = chunk_counter + (uint64_t) (left_input_len / BLAKE3_CHUNK_LEN);
530
531 uint8_t cv_array[2 * 2 * BLAKE3_OUT_LEN];
532 size_t degree = 1;
533 uint8_t *right_cvs;
534 size_t left_n, right_n;
535
536 if (left_input_len > BLAKE3_CHUNK_LEN) {
537 degree = 2;
538 }
539 right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
540
541 left_n = blake3_compress_subtree(input, left_input_len, key, chunk_counter, flags, cv_array);
542 right_n = blake3_compress_subtree(
543 right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
544
545 if (1 == left_n) {
546 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
547 return 2;
548 }
549
550 // Otherwise, do one layer of parent node compression.
551 size_t num_chaining_values = left_n + right_n;
552 return blake3_compress_parents(cv_array, num_chaining_values, key, flags, out);
553}
554
555void blake3_compress_subtree_to_parent(const uint8_t *input,
556 size_t input_len,
557 const uint32_t *key,
558 uint64_t chunk_counter,
559 uint8_t flags,
560 uint8_t *out)
561{
562 uint8_t cv_array[2 * BLAKE3_OUT_LEN];
563 size_t num_cvs = blake3_compress_subtree(input, input_len, key, chunk_counter, flags, cv_array);
564
565 uint8_t out_array[BLAKE3_OUT_LEN];
566 while (num_cvs > 2) {
567 num_cvs = blake3_compress_parents(cv_array, num_cvs, key, flags, out_array);
568 memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
569 }
570 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
571}
572
573void blake3_hasher_merge_cv(cx_blake3_t *hash, uint64_t total_len)
574{
575 size_t post_merge_stack_len = (size_t) hw(total_len);
576 cx_blake3_state_out_t output;
577 uint8_t *parent_node;
578
579 while (hash->cv_stack_len > post_merge_stack_len) {
580 parent_node = hash->cv_stack + (hash->cv_stack_len - 2) * BLAKE3_OUT_LEN;
581 memcpy(output.input_cv, hash->key, BLAKE3_OUT_LEN);
582 memcpy(output.block, parent_node, BLAKE3_BLOCK_LEN);
583 output.block_len = BLAKE3_BLOCK_LEN;
584 output.counter = 0;
585 output.d = (hash->chunk).d | PARENT;
586 blake3_output_chain(&output, parent_node);
587 hash->cv_stack_len -= 1;
588 }
589}
590
591void blake3_hasher_push_cv(cx_blake3_t *hash, uint8_t *new_cv, uint64_t chunk_counter)
592{
593 blake3_hasher_merge_cv(hash, chunk_counter);
594 memcpy(hash->cv_stack + hash->cv_stack_len * BLAKE3_OUT_LEN, new_cv, BLAKE3_OUT_LEN);
595 hash->cv_stack_len += 1;
596}
597
598void blake3_output_root_bytes(const cx_blake3_state_out_t *chunk_out, uint8_t *out, size_t out_len)
599{
600 uint64_t output_block_counter = 0;
601 size_t offset_within_block = 0;
602 uint8_t wide_buf[BLAKE3_BLOCK_LEN];
603
604 while (out_len > 0) {
605 blake3_compress_xof(chunk_out->input_cv,
606 chunk_out->block,
607 chunk_out->block_len,
608 output_block_counter,
609 chunk_out->d | ROOT,
610 wide_buf);
611 size_t available_bytes = BLAKE3_BLOCK_LEN - offset_within_block;
612 size_t memcpy_len;
613 if (out_len > available_bytes) {
614 memcpy_len = available_bytes;
615 }
616 else {
617 memcpy_len = out_len;
618 }
619 memcpy(out, wide_buf + offset_within_block, memcpy_len);
620 out += memcpy_len;
621 out_len -= memcpy_len;
622 output_block_counter += 1;
623 offset_within_block = 0;
624 }
625}
626
627void blake3_init_ctx(cx_blake3_t *hash, const uint32_t *key, uint8_t mode)
628{
629 memcpy(hash->key, key, BLAKE3_KEY_LEN);
630 blake3_state_init(&hash->chunk, key, mode);
631 hash->cv_stack_len = 0;
632 hash->is_init = true;
633}
634
635#endif // HAVE_BLAKE3
unsigned char uint8_t
Definition usbd_conf.h:53