Embedded SDK
cx_blake3_ref.c
#ifdef HAVE_BLAKE3

#include "cx_blake3_ref.h"
#include "cx_utils.h"
#include "cx_ram.h"

#include <string.h>

/* ---------------------------------------------------------------------------------------------------*/
/* Functions from the reference implementation */
/* ---------------------------------------------------------------------------------------------------*/

/********************* Compression function ************************/

// The 'quarter-round' function G_i(a,b,c,d)
static void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y)
{
    state[a] = state[a] + state[b] + x;
    state[d] = rotr32(state[d] ^ state[a], 16);
    state[c] = state[c] + state[d];
    state[b] = rotr32(state[b] ^ state[c], 12);
    state[a] = state[a] + state[b] + y;
    state[d] = rotr32(state[d] ^ state[a], 8);
    state[c] = state[c] + state[d];
    state[b] = rotr32(state[b] ^ state[c], 7);
}

// The round function F = |G_0(0,4,8,12)  G_1(1,5,9,13)  G_2(2,6,10,14) G_3(3,7,11,15)|
//                        |G_4(0,5,10,15) G_5(1,6,11,12) G_6(2,7,8,13)  G_7(3,4,9,14) |
static void round_fn(uint32_t state[16], const uint32_t *msg, size_t round)
{
    // Select the message schedule based on the round.
    const uint8_t *schedule = MSG_SCHEDULE[round];

    // Mix the columns.
    g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
    g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
    g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
    g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

    // Mix the rows.
    g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
    g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
    g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
    g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}

// Compressing a block along with a chaining value
static void compress_pre(uint32_t *state,
                         const uint32_t *cv,
                         const uint8_t *block,
                         uint8_t block_len,
                         uint64_t counter,
                         uint8_t flags)
{
    uint32_t block_words[16];
    size_t round;
    // First convert the block into 32-bit words
    for (int i = 0; i < 16; i++) {
        block_words[i] = load32(block + 4 * i);
    }

    // Initialize the state
    state[0] = cv[0];
    state[1] = cv[1];
    state[2] = cv[2];
    state[3] = cv[3];
    state[4] = cv[4];
    state[5] = cv[5];
    state[6] = cv[6];
    state[7] = cv[7];
    state[8] = IV[0];
    state[9] = IV[1];
    state[10] = IV[2];
    state[11] = IV[3];
    state[12] = (uint32_t) counter;
    state[13] = (uint32_t) (counter >> 32);
    state[14] = (uint32_t) block_len;
    state[15] = (uint32_t) flags;

    for (round = 0; round < 7; round++) {
        round_fn(state, &block_words[0], round);
    }
}

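// Compress one block and update the chaining value cv in place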
static void blake3_compress_in_place(uint32_t *cv,
                                     const uint8_t *block,
                                     uint8_t block_len,
                                     uint64_t counter,
                                     uint8_t flags)
{
    uint32_t state[16];
    compress_pre(&state[0], cv, block, block_len, counter, flags);
    cv[0] = state[0] ^ state[8];
    cv[1] = state[1] ^ state[9];
    cv[2] = state[2] ^ state[10];
    cv[3] = state[3] ^ state[11];
    cv[4] = state[4] ^ state[12];
    cv[5] = state[5] ^ state[13];
    cv[6] = state[6] ^ state[14];
    cv[7] = state[7] ^ state[15];
}

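// Compress one block and write the full 64-byte extended output to out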
static void blake3_compress_xof(const uint32_t *cv,
                                const uint8_t *block,
                                uint8_t block_len,
                                uint64_t counter,
                                uint8_t flags,
                                uint8_t *out)
{
    uint32_t state[16];
    compress_pre(state, cv, block, block_len, counter, flags);

    store32(&out[0 * 4], state[0] ^ state[8]);
    store32(&out[1 * 4], state[1] ^ state[9]);
    store32(&out[2 * 4], state[2] ^ state[10]);
    store32(&out[3 * 4], state[3] ^ state[11]);
    store32(&out[4 * 4], state[4] ^ state[12]);
    store32(&out[5 * 4], state[5] ^ state[13]);
    store32(&out[6 * 4], state[6] ^ state[14]);
    store32(&out[7 * 4], state[7] ^ state[15]);
    store32(&out[8 * 4], state[8] ^ cv[0]);
    store32(&out[9 * 4], state[9] ^ cv[1]);
    store32(&out[10 * 4], state[10] ^ cv[2]);
    store32(&out[11 * 4], state[11] ^ cv[3]);
    store32(&out[12 * 4], state[12] ^ cv[4]);
    store32(&out[13 * 4], state[13] ^ cv[5]);
    store32(&out[14 * 4], state[14] ^ cv[6]);
    store32(&out[15 * 4], state[15] ^ cv[7]);
}

/******************** Chunk state functions *********************/
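// Copy as many input bytes as fit into the chunk buffer and return the number of bytes consumed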
static size_t blake3_fill_buffer(cx_blake3_state_t *chunk_state,
                                 const uint8_t *input,
                                 size_t input_len)
{
    size_t nb_bytes = BLAKE3_BLOCK_LEN - ((size_t) chunk_state->buffer_len);
    if (nb_bytes > input_len) {
        nb_bytes = input_len;
    }
    memcpy(chunk_state->buffer + ((size_t) chunk_state->buffer_len), input, nb_bytes);
    chunk_state->buffer_len += (uint8_t) nb_bytes;

    return nb_bytes;
}

void blake3_state_init(cx_blake3_state_t *chunk_state, const uint32_t *key, uint8_t flags)
{
    memcpy(chunk_state->cv, key, BLAKE3_KEY_LEN);
    chunk_state->t = 0;
    memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
    chunk_state->buffer_len = 0;
    chunk_state->blocks_compressed = 0;
    chunk_state->d = flags;
}

void blake3_state_update(cx_blake3_state_t *chunk_state, const uint8_t *input, size_t input_len)
{
    size_t nb_bytes;
    uint8_t is_start_flag = 0;
    if (chunk_state->buffer_len > 0) {
        nb_bytes = blake3_fill_buffer(chunk_state, input, input_len);
        input += nb_bytes;
        input_len -= nb_bytes;
        if (input_len > 0) {
            if (!chunk_state->blocks_compressed) {
                is_start_flag = CHUNK_START;
            }
            blake3_compress_in_place(chunk_state->cv,
                                     chunk_state->buffer,
                                     BLAKE3_BLOCK_LEN,
                                     chunk_state->t,
                                     chunk_state->d | is_start_flag);
            chunk_state->blocks_compressed += 1;
            chunk_state->buffer_len = 0;
            memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
        }
    }

    while (input_len > BLAKE3_BLOCK_LEN) {
        if (!chunk_state->blocks_compressed) {
            is_start_flag = CHUNK_START;
        }
        else {
            is_start_flag = 0;
        }
        blake3_compress_in_place(chunk_state->cv,
                                 input,
                                 BLAKE3_BLOCK_LEN,
                                 chunk_state->t,
                                 chunk_state->d | is_start_flag);
        chunk_state->blocks_compressed += 1;
        input += BLAKE3_BLOCK_LEN;
        input_len -= BLAKE3_BLOCK_LEN;
    }

    nb_bytes = blake3_fill_buffer(chunk_state, input, input_len);
}

void blake3_state_reset(cx_blake3_state_t *chunk_state, const uint32_t *key, uint64_t chunk_counter)
{
    memcpy(chunk_state->cv, key, BLAKE3_KEY_LEN);
    chunk_state->t = chunk_counter;
    chunk_state->blocks_compressed = 0;
    memset(chunk_state->buffer, 0, BLAKE3_BLOCK_LEN);
    chunk_state->buffer_len = 0;
}

cx_blake3_state_out_t blake3_state_output(const cx_blake3_state_t *chunk_state)
{
    uint8_t is_start_flag = 0;
    uint8_t block_flags;
    cx_blake3_state_out_t chunk_output;

    if (!chunk_state->blocks_compressed) {
        is_start_flag = CHUNK_START;
    }
    block_flags = chunk_state->d | is_start_flag | CHUNK_END;
    memcpy(chunk_output.input_cv, chunk_state->cv, BLAKE3_OUT_LEN);
    memcpy(chunk_output.block, chunk_state->buffer, BLAKE3_BLOCK_LEN);
    chunk_output.block_len = chunk_state->buffer_len;
    chunk_output.counter = chunk_state->t;
    chunk_output.d = block_flags;

    return chunk_output;
}

void blake3_output_chain(const cx_blake3_state_out_t *out, uint8_t *cv)
{
    uint32_t cv_words[BLAKE3_NB_OF_WORDS];
    memcpy(cv_words, out->input_cv, BLAKE3_WORD_SIZE);
    blake3_compress_in_place(cv_words, out->block, out->block_len, out->counter, out->d);
    store_cv_words(cv, cv_words);
}

/********************* Hashing ************************/
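// Hash a sequence of blocks from a single input and store the resulting chaining value in out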
static void blake3_hash_one(const uint8_t *input,
                            size_t blocks,
                            const uint32_t *key,
                            uint64_t counter,
                            uint8_t flags,
                            uint8_t flags_start,
                            uint8_t flags_end,
                            uint8_t *out)
{
    uint32_t cv[BLAKE3_NB_OF_WORDS];
    memcpy(cv, key, BLAKE3_KEY_LEN);
    uint8_t block_flags = flags | flags_start;
    while (blocks > 0) {
        if (1 == blocks) {
            block_flags |= flags_end;
        }
        blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
        input = &input[BLAKE3_BLOCK_LEN];
        blocks -= 1;
        block_flags = flags;
    }
    store_cv_words(out, cv);
}

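// Hash several inputs of 'blocks' blocks each, writing one chaining value per input to out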
static void blake3_hash_many(const uint8_t *const *inputs,
                             size_t num_inputs,
                             size_t blocks,
                             const uint32_t *key,
                             uint64_t counter,
                             bool increment_counter,
                             uint8_t flags,
                             uint8_t flags_start,
                             uint8_t flags_end,
                             uint8_t *out)
{
    while (num_inputs > 0) {
        blake3_hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
        if (increment_counter) {
            counter += 1;
        }
        inputs += 1;
        num_inputs -= 1;
        out = &out[BLAKE3_OUT_LEN];
    }
}

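// Compress the whole chunks of the input (plus a trailing partial chunk, if any)
// and return the number of chaining values written to out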
static size_t blake3_compress_chunks(const uint8_t *input,
                                     size_t input_len,
                                     const uint32_t *key,
                                     uint64_t chunk_counter,
                                     uint8_t flags,
                                     uint8_t *out)
{
    const uint8_t *chunks_array[1];
    size_t input_position = 0;
    size_t chunks_array_len = 0;
    cx_blake3_state_out_t output;

    while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
        chunks_array[chunks_array_len] = &input[input_position];
        input_position += BLAKE3_CHUNK_LEN;
        chunks_array_len += 1;
    }

    blake3_hash_many(chunks_array,
                     chunks_array_len,
                     BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
                     key,
                     chunk_counter,
                     true,
                     flags,
                     CHUNK_START,
                     CHUNK_END,
                     out);

    // Hash the remaining partial chunk, if there is one. Note that the empty
    // chunk (meaning the empty message) is a different codepath.
    if (input_len > input_position) {
        uint64_t counter = chunk_counter + (uint64_t) chunks_array_len;
        cx_blake3_state_t chunk_state;
        blake3_state_init(&chunk_state, key, flags);
        chunk_state.t = counter;
        blake3_state_update(&chunk_state, input + input_position, input_len - input_position);
        output = blake3_state_output(&chunk_state);
        blake3_output_chain(&output, out + chunks_array_len * BLAKE3_OUT_LEN);
        return chunks_array_len + 1;
    }
    else {
        return chunks_array_len;
    }
}

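// Compress pairs of child chaining values into parent chaining values
// and return the number of values written to out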
static size_t blake3_compress_parents(const uint8_t *child_chaining_values,
                                      size_t num_chaining_values,
                                      const uint32_t *key,
                                      uint8_t flags,
                                      uint8_t *out)
{
    const uint8_t *parents_array[2];
    size_t parents_array_len = 0;

    while (num_chaining_values - (2 * parents_array_len) >= 2) {
        parents_array[parents_array_len]
            = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
        parents_array_len += 1;
    }

    blake3_hash_many(parents_array,
                     parents_array_len,
                     1,
                     key,
                     0, // Parents always use counter 0.
                     false,
                     flags | PARENT,
                     0, // Parents have no start flags.
                     0, // Parents have no end flags.
                     out);

    // If there's an odd child left over, it becomes an output.
    if (num_chaining_values > 2 * parents_array_len) {
        memcpy(out + parents_array_len * BLAKE3_OUT_LEN,
               child_chaining_values + 2 * parents_array_len * BLAKE3_OUT_LEN,
               BLAKE3_OUT_LEN);
        return parents_array_len + 1;
    }
    else {
        return parents_array_len;
    }
}

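// Recursively compress a subtree of the input and return the number of chaining values written to out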
static size_t blake3_compress_subtree(const uint8_t *input,
                                      size_t input_len,
                                      const uint32_t *key,
                                      uint64_t chunk_counter,
                                      uint8_t flags,
                                      uint8_t *out)
{
    // One chunk
    if (input_len <= BLAKE3_CHUNK_LEN) {
        return blake3_compress_chunks(input, input_len, key, chunk_counter, flags, out);
    }

    // At least two chunks
    // The left subtree consists of the first 2^(10 + floor(log2((n - 1) / 1024))) bytes, where n is
    // the input length, and the right subtree consists of the remainder.
    size_t left_input_len = 1ULL << highest_one(((input_len - 1) / BLAKE3_CHUNK_LEN) | 1);
    left_input_len = left_input_len * BLAKE3_CHUNK_LEN;
    size_t right_input_len = input_len - left_input_len;
    const uint8_t *right_input = &input[left_input_len];
    uint64_t right_chunk_counter = chunk_counter + (uint64_t) (left_input_len / BLAKE3_CHUNK_LEN);
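    // Worked example (illustrative figures only): for a 5000-byte input,
    // (5000 - 1) / BLAKE3_CHUNK_LEN = 4, 4 | 1 = 5 and highest_one(5) = 2, so
    // left_input_len = (1 << 2) * BLAKE3_CHUNK_LEN = 4096 bytes and
    // right_input_len = 904 bytes: four full chunks on the left, the remainder on the right.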

    uint8_t cv_array[2 * 2 * BLAKE3_OUT_LEN];
    size_t degree = 1;
    uint8_t *right_cvs;
    size_t left_n, right_n;

    if (left_input_len > BLAKE3_CHUNK_LEN) {
        degree = 2;
    }
    right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

    left_n = blake3_compress_subtree(input, left_input_len, key, chunk_counter, flags, cv_array);
    right_n = blake3_compress_subtree(
        right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);

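    // Special case: if each half produced a single chaining value, return both
    // directly so that the caller always gets at least two outputs.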
    if (1 == left_n) {
        memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
        return 2;
    }

    // Otherwise, do one layer of parent node compression.
    size_t num_chaining_values = left_n + right_n;
    return blake3_compress_parents(cv_array, num_chaining_values, key, flags, out);
}

void blake3_compress_subtree_to_parent(const uint8_t *input,
                                       size_t input_len,
                                       const uint32_t *key,
                                       uint64_t chunk_counter,
                                       uint8_t flags,
                                       uint8_t *out)
{
    uint8_t cv_array[2 * BLAKE3_OUT_LEN];
    size_t num_cvs = blake3_compress_subtree(input, input_len, key, chunk_counter, flags, cv_array);

    uint8_t out_array[BLAKE3_OUT_LEN];
    while (num_cvs > 2) {
        num_cvs = blake3_compress_parents(cv_array, num_cvs, key, flags, out_array);
        memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
    }
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}

void blake3_hasher_merge_cv(cx_blake3_t *hash, uint64_t total_len)
{
    size_t post_merge_stack_len = (size_t) hw(total_len);
    cx_blake3_state_out_t output;
    uint8_t *parent_node;

    while (hash->cv_stack_len > post_merge_stack_len) {
        parent_node = hash->cv_stack + (hash->cv_stack_len - 2) * BLAKE3_OUT_LEN;
        memcpy(output.input_cv, hash->key, BLAKE3_OUT_LEN);
        memcpy(output.block, parent_node, BLAKE3_BLOCK_LEN);
        output.block_len = BLAKE3_BLOCK_LEN;
        output.counter = 0;
        output.d = (hash->chunk).d | PARENT;
        blake3_output_chain(&output, parent_node);
        hash->cv_stack_len -= 1;
    }
}

void blake3_hasher_push_cv(cx_blake3_t *hash, uint8_t *new_cv, uint64_t chunk_counter)
{
    blake3_hasher_merge_cv(hash, chunk_counter);
    memcpy(hash->cv_stack + hash->cv_stack_len * BLAKE3_OUT_LEN, new_cv, BLAKE3_OUT_LEN);
    hash->cv_stack_len += 1;
}

void blake3_output_root_bytes(const cx_blake3_state_out_t *chunk_out, uint8_t *out, size_t out_len)
{
    uint64_t output_block_counter = 0;
    size_t offset_within_block = 0;
    uint8_t wide_buf[BLAKE3_BLOCK_LEN];

    while (out_len > 0) {
        blake3_compress_xof(chunk_out->input_cv,
                            chunk_out->block,
                            chunk_out->block_len,
                            output_block_counter,
                            chunk_out->d | ROOT,
                            wide_buf);
        size_t available_bytes = BLAKE3_BLOCK_LEN - offset_within_block;
        size_t memcpy_len;
        if (out_len > available_bytes) {
            memcpy_len = available_bytes;
        }
        else {
            memcpy_len = out_len;
        }
        memcpy(out, wide_buf + offset_within_block, memcpy_len);
        out += memcpy_len;
        out_len -= memcpy_len;
        output_block_counter += 1;
        offset_within_block = 0;
    }
}

void blake3_init_ctx(cx_blake3_t *hash, const uint32_t *key, uint8_t mode)
{
    memcpy(hash->key, key, BLAKE3_KEY_LEN);
    blake3_state_init(&hash->chunk, key, mode);
    hash->cv_stack_len = 0;
    hash->is_init = true;
}
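
/*
 * Illustrative single-chunk flow (a sketch, not part of this file's exported API):
 * for an unkeyed hash of fewer than BLAKE3_CHUNK_LEN bytes, the helpers above chain
 * roughly as follows, using the standard IV words as the key; `message` and
 * `message_len` are placeholder inputs:
 *
 *   cx_blake3_t hash;
 *   uint8_t digest[BLAKE3_OUT_LEN];
 *   blake3_init_ctx(&hash, IV, 0);
 *   blake3_state_update(&hash.chunk, message, message_len);
 *   cx_blake3_state_out_t root = blake3_state_output(&hash.chunk);
 *   blake3_output_root_bytes(&root, digest, sizeof(digest));
 *
 * Longer inputs additionally go through blake3_compress_subtree_to_parent and
 * the chaining-value stack (blake3_hasher_push_cv / blake3_hasher_merge_cv).
 */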

#endif // HAVE_BLAKE3