Sector -> cluster cache, optimize _gf256_mul_x_le
This commit is contained in:
parent
1881583eea
commit
4425e81085
3 changed files with 132 additions and 105 deletions
|
@ -51,9 +51,9 @@
|
||||||
|
|
||||||
extern hekate_config h_cfg;
|
extern hekate_config h_cfg;
|
||||||
|
|
||||||
extern bool clear_sector_cache;
|
extern bool clear_cluster_cache;
|
||||||
extern bool lock_sector_cache;
|
extern bool lock_cluster_cache;
|
||||||
extern u32 secindex;
|
extern u32 cluster_cache_index;
|
||||||
|
|
||||||
u32 _key_count = 0, _titlekey_count = 0;
|
u32 _key_count = 0, _titlekey_count = 0;
|
||||||
u32 color_idx = 0;
|
u32 color_idx = 0;
|
||||||
|
@ -593,6 +593,7 @@ pkg2_done:
|
||||||
}
|
}
|
||||||
|
|
||||||
path[25] = '/';
|
path[25] = '/';
|
||||||
|
lock_cluster_cache = true;
|
||||||
while (!f_readdir(&dir, &fno) && fno.fname[0] && titles_found < title_limit) {
|
while (!f_readdir(&dir, &fno) && fno.fname[0] && titles_found < title_limit) {
|
||||||
minerva_periodic_training();
|
minerva_periodic_training();
|
||||||
memcpy(path + 26, fno.fname, 36);
|
memcpy(path + 26, fno.fname, 36);
|
||||||
|
@ -614,7 +615,6 @@ pkg2_done:
|
||||||
}
|
}
|
||||||
hash_index = 0;
|
hash_index = 0;
|
||||||
// decrypt only what is needed to locate needed keys
|
// decrypt only what is needed to locate needed keys
|
||||||
lock_sector_cache = true;
|
|
||||||
temp_file = (u8*)_nca_process(5, 4, &fp, pkg1_id->key_info.es_offset, 0xc0, key_area_key);
|
temp_file = (u8*)_nca_process(5, 4, &fp, pkg1_id->key_info.es_offset, 0xc0, key_area_key);
|
||||||
for (u32 i = 0; i <= 0xb0; ) {
|
for (u32 i = 0; i <= 0xb0; ) {
|
||||||
se_calc_sha256(temp_hash, temp_file + i, 0x10);
|
se_calc_sha256(temp_hash, temp_file + i, 0x10);
|
||||||
|
@ -631,9 +631,7 @@ pkg2_done:
|
||||||
free(temp_file);
|
free(temp_file);
|
||||||
temp_file = NULL;
|
temp_file = NULL;
|
||||||
titles_found++;
|
titles_found++;
|
||||||
lock_sector_cache = false;
|
|
||||||
} else if (_read_le_u32(dec_header, 0x210) == 0x24 && dec_header[0x205] == 0) {
|
} else if (_read_le_u32(dec_header, 0x210) == 0x24 && dec_header[0x205] == 0) {
|
||||||
lock_sector_cache = true;
|
|
||||||
temp_file = (u8*)_nca_process(5, 4, &fp, pkg1_id->key_info.ssl_offset, 0x70, key_area_key);
|
temp_file = (u8*)_nca_process(5, 4, &fp, pkg1_id->key_info.ssl_offset, 0x70, key_area_key);
|
||||||
for (u32 i = 0; i <= 0x60; i++) {
|
for (u32 i = 0; i <= 0x60; i++) {
|
||||||
se_calc_sha256(temp_hash, temp_file + i, 0x10);
|
se_calc_sha256(temp_hash, temp_file + i, 0x10);
|
||||||
|
@ -653,12 +651,12 @@ pkg2_done:
|
||||||
free(temp_file);
|
free(temp_file);
|
||||||
temp_file = NULL;
|
temp_file = NULL;
|
||||||
titles_found++;
|
titles_found++;
|
||||||
lock_sector_cache = false;
|
|
||||||
}
|
}
|
||||||
f_close(&fp);
|
f_close(&fp);
|
||||||
}
|
}
|
||||||
f_closedir(&dir);
|
f_closedir(&dir);
|
||||||
free(dec_header);
|
free(dec_header);
|
||||||
|
lock_cluster_cache = false;
|
||||||
|
|
||||||
// derive eticket_rsa_kek and ssl_rsa_kek
|
// derive eticket_rsa_kek and ssl_rsa_kek
|
||||||
if (_key_exists(es_keys[0]) && _key_exists(es_keys[1]) && _key_exists(master_key[0])) {
|
if (_key_exists(es_keys[0]) && _key_exists(es_keys[1]) && _key_exists(master_key[0])) {
|
||||||
|
@ -799,7 +797,7 @@ get_titlekeys:
|
||||||
save_ctx->file = &fp;
|
save_ctx->file = &fp;
|
||||||
save_ctx->tool_ctx.action = 0;
|
save_ctx->tool_ctx.action = 0;
|
||||||
memcpy(save_ctx->save_mac_key, save_mac_key, 0x10);
|
memcpy(save_ctx->save_mac_key, save_mac_key, 0x10);
|
||||||
clear_sector_cache = true;
|
clear_cluster_cache = true;
|
||||||
save_process_success = save_process(save_ctx);
|
save_process_success = save_process(save_ctx);
|
||||||
if (!save_process_success) {
|
if (!save_process_success) {
|
||||||
EPRINTF("Failed to process e1 save.");
|
EPRINTF("Failed to process e1 save.");
|
||||||
|
@ -876,7 +874,7 @@ get_titlekeys:
|
||||||
save_ctx->file = &fp;
|
save_ctx->file = &fp;
|
||||||
save_ctx->tool_ctx.action = 0;
|
save_ctx->tool_ctx.action = 0;
|
||||||
memcpy(save_ctx->save_mac_key, save_mac_key, 0x10);
|
memcpy(save_ctx->save_mac_key, save_mac_key, 0x10);
|
||||||
clear_sector_cache = true;
|
clear_cluster_cache = true;
|
||||||
save_process_success = save_process(save_ctx);
|
save_process_success = save_process(save_ctx);
|
||||||
if (!save_process_success) {
|
if (!save_process_success) {
|
||||||
EPRINTF("Failed to process e2 save.");
|
EPRINTF("Failed to process e2 save.");
|
||||||
|
@ -958,7 +956,7 @@ dismount:;
|
||||||
free(save_ctx);
|
free(save_ctx);
|
||||||
}
|
}
|
||||||
f_mount(NULL, "emmc:", 1);
|
f_mount(NULL, "emmc:", 1);
|
||||||
clear_sector_cache = true;
|
clear_cluster_cache = true;
|
||||||
nx_emmc_gpt_free(&gpt);
|
nx_emmc_gpt_free(&gpt);
|
||||||
|
|
||||||
key_output: ;
|
key_output: ;
|
||||||
|
|
|
@ -37,19 +37,26 @@ extern sdmmc_storage_t sd_storage;
|
||||||
extern sdmmc_storage_t storage;
|
extern sdmmc_storage_t storage;
|
||||||
extern emmc_part_t *system_part;
|
extern emmc_part_t *system_part;
|
||||||
|
|
||||||
typedef struct {
|
#define MAX_CLUSTER_CACHE_ENTRIES 128
|
||||||
u32 sector;
|
#define CLUSTER_LOOKUP_EMPTY_ENTRY 0xFFFFFFFF
|
||||||
u32 visit_count;
|
#define XTS_CLUSTER_SIZE 0x4000
|
||||||
u8 align[8];
|
#define SECTORS_PER_CLUSTER 0x20
|
||||||
u8 tweak[0x10];
|
|
||||||
u8 cached_sector[0x200];
|
|
||||||
} sector_cache_t;
|
|
||||||
|
|
||||||
#define MAX_SEC_CACHE_ENTRIES 256
|
typedef struct {
|
||||||
static sector_cache_t *sector_cache = (sector_cache_t *)(MIXD_BUF_ALIGNED + 0x100000); //NULL;
|
u32 cluster_num; // index of the cluster in the partition
|
||||||
u32 secindex = 0;
|
u32 visit_count; // used for debugging/access analysis
|
||||||
bool clear_sector_cache = false;
|
u8 dirty; // flag: cluster has been modified but not yet written back
|
||||||
bool lock_sector_cache = false;
|
u8 align[7];
|
||||||
|
u8 cluster[XTS_CLUSTER_SIZE]; // the cached cluster itself
|
||||||
|
} cluster_cache_t;
|
||||||
|
|
||||||
|
static cluster_cache_t *cluster_cache = (cluster_cache_t *)RAM_DISK_ADDR;
|
||||||
|
u32 cluster_cache_index = 0;
|
||||||
|
u32 *cluster_lookup = (u32 *)(RAM_DISK_ADDR + MAX_CLUSTER_CACHE_ENTRIES * sizeof(cluster_cache_t));
|
||||||
|
u8 *emmc_buffer = (u8 *)(MIXD_BUF_ALIGNED + 0x100000);
|
||||||
|
|
||||||
|
bool clear_cluster_cache = false;
|
||||||
|
bool lock_cluster_cache = false;
|
||||||
|
|
||||||
DSTATUS disk_status (
|
DSTATUS disk_status (
|
||||||
BYTE pdrv /* Physical drive number to identify the drive */
|
BYTE pdrv /* Physical drive number to identify the drive */
|
||||||
|
@ -65,21 +72,23 @@ DSTATUS disk_initialize (
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void _gf256_mul_x_le(void *block) {
|
static inline void _gf256_mul_x_le(void *block)
|
||||||
u8 *pdata = (u8 *)block;
|
{
|
||||||
|
u32 *pdata = (u32 *)block;
|
||||||
u32 carry = 0;
|
u32 carry = 0;
|
||||||
|
|
||||||
for (u32 i = 0; i < 0x10; i++) {
|
for (u32 i = 0; i < 4; i++) {
|
||||||
u8 b = pdata[i];
|
u32 b = pdata[i];
|
||||||
pdata[i] = (b << 1) | carry;
|
pdata[i] = (b << 1) | carry;
|
||||||
carry = b >> 7;
|
carry = b >> 31;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (carry)
|
if (carry)
|
||||||
pdata[0x0] ^= 0x87;
|
pdata[0x0] ^= 0x87;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u64 sec, void *dst, void *src, u32 secsize) {
|
static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u64 sec, void *dst, void *src, u32 secsize)
|
||||||
|
{
|
||||||
int res = 0;
|
int res = 0;
|
||||||
u8 *temptweak = (u8 *)malloc(0x10);
|
u8 *temptweak = (u8 *)malloc(0x10);
|
||||||
u32 *pdst = (u32 *)dst;
|
u32 *pdst = (u32 *)dst;
|
||||||
|
@ -95,26 +104,33 @@ static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_twe
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (u32 i = 0; i < tweak_exp * 0x20; i++)
|
// tweak_exp allows us to use a saved tweak to reduce _gf256_mul_x_le calls
|
||||||
|
for (u32 i = 0; i < tweak_exp * SECTORS_PER_CLUSTER; i++)
|
||||||
_gf256_mul_x_le(tweak);
|
_gf256_mul_x_le(tweak);
|
||||||
|
|
||||||
memcpy(temptweak, tweak, 0x10);
|
memcpy(temptweak, tweak, 0x10);
|
||||||
|
|
||||||
//We are assuming a 0x10-aligned sector size in this implementation.
|
|
||||||
for (u32 i = 0; i < secsize / 0x10; i++) {
|
// The reference implementation in IEEE P1619 encrypts once per AES block
|
||||||
|
// In this environment, doing so produces a lot of overhead
|
||||||
|
// Instead, we perform one single AES-ECB operation between the sector xors
|
||||||
|
|
||||||
|
// We are assuming a 0x10-aligned sector size in this implementation.
|
||||||
|
for (u32 i = 0; i < secsize / 0x10; i++)
|
||||||
|
{
|
||||||
for (u32 j = 0; j < 4; j++)
|
for (u32 j = 0; j < 4; j++)
|
||||||
pdst[j] = psrc[j] ^ ptweak[j];
|
pdst[j] = psrc[j] ^ ptweak[j];
|
||||||
_gf256_mul_x_le(tweak);
|
_gf256_mul_x_le(tweak);
|
||||||
psrc += 4;
|
psrc += 4;
|
||||||
pdst += 4;
|
pdst += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
se_aes_crypt_ecb(ks2, enc, dst, secsize, dst, secsize);
|
se_aes_crypt_ecb(ks2, enc, dst, secsize, dst, secsize);
|
||||||
|
|
||||||
pdst = (u32 *)dst;
|
pdst = (u32 *)dst;
|
||||||
|
|
||||||
memcpy(tweak, temptweak, 0x10);
|
memcpy(tweak, temptweak, 0x10);
|
||||||
for (u32 i = 0; i < secsize / 0x10; i++) {
|
for (u32 i = 0; i < secsize / 0x10; i++)
|
||||||
|
{
|
||||||
for (u32 j = 0; j < 4; j++)
|
for (u32 j = 0; j < 4; j++)
|
||||||
pdst[j] = pdst[j] ^ ptweak[j];
|
pdst[j] = pdst[j] ^ ptweak[j];
|
||||||
_gf256_mul_x_le(tweak);
|
_gf256_mul_x_le(tweak);
|
||||||
|
@ -138,74 +154,87 @@ DRESULT disk_read (
|
||||||
switch (pdrv)
|
switch (pdrv)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
if (((u32)buff >= DRAM_START) && !((u32)buff % 8))
|
|
||||||
return sdmmc_storage_read(&sd_storage, sector, count, buff) ? RES_OK : RES_ERROR;
|
return sdmmc_storage_read(&sd_storage, sector, count, buff) ? RES_OK : RES_ERROR;
|
||||||
u8 *buf = (u8 *)SDMMC_UPPER_BUFFER;
|
|
||||||
if (sdmmc_storage_read(&sd_storage, sector, count, buf))
|
|
||||||
{
|
|
||||||
memcpy(buff, buf, 512 * count);
|
|
||||||
return RES_OK;
|
|
||||||
}
|
|
||||||
return RES_ERROR;
|
|
||||||
|
|
||||||
case 1:;
|
case 1:;
|
||||||
__attribute__ ((aligned (16))) static u8 tweak[0x10];
|
__attribute__ ((aligned (16))) static u8 tweak[0x10];
|
||||||
__attribute__ ((aligned (16))) static u64 prev_cluster = -1;
|
__attribute__ ((aligned (16))) static u64 prev_cluster = -1;
|
||||||
__attribute__ ((aligned (16))) static u32 prev_sector = 0;
|
__attribute__ ((aligned (16))) static u32 prev_sector = 0;
|
||||||
bool needs_cache_sector = false;
|
|
||||||
|
|
||||||
if (secindex == 0 || clear_sector_cache) {
|
if (cluster_cache_index == 0 || clear_cluster_cache)
|
||||||
clear_sector_cache = false;
|
{
|
||||||
lock_sector_cache = false;
|
// memset gets optimized out...
|
||||||
secindex = 0;
|
// for (u32 i = 0; i < (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER; i++)
|
||||||
|
// cluster_lookup[i] = CLUSTER_LOOKUP_EMPTY_ENTRY;
|
||||||
|
memset(cluster_lookup, -1, (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER * 4);
|
||||||
|
cluster_cache_index = 0;
|
||||||
|
clear_cluster_cache = false;
|
||||||
|
lock_cluster_cache = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 s = 0;
|
u32 cluster = sector / SECTORS_PER_CLUSTER;
|
||||||
// only attempt to cache single-sector reads as these are most likely to be repeated (eg. rereading FAT)
|
u32 aligned_sector = cluster * SECTORS_PER_CLUSTER;
|
||||||
if (!lock_sector_cache && count == 1) {
|
u32 sector_index_in_cluster = sector % SECTORS_PER_CLUSTER;
|
||||||
for ( ; s < secindex; s++) {
|
u32 cluster_lookup_index = cluster_lookup[cluster];
|
||||||
if (sector_cache[s].sector == sector) {
|
|
||||||
sector_cache[s].visit_count++;
|
if (cluster_lookup_index != CLUSTER_LOOKUP_EMPTY_ENTRY)
|
||||||
memcpy(buff, sector_cache[s].cached_sector, 0x200);
|
{
|
||||||
memcpy(tweak, sector_cache[s].tweak, 0x10);
|
memcpy(buff, cluster_cache[cluster_lookup_index].cluster + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, count * NX_EMMC_BLOCKSIZE);
|
||||||
prev_sector = sector;
|
cluster_cache[cluster_lookup_index].visit_count++;
|
||||||
prev_cluster = sector / 0x20;
|
prev_sector = sector + count - 1;
|
||||||
|
prev_cluster = cluster;
|
||||||
return RES_OK;
|
return RES_OK;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// add to cache
|
// Only cache single-sector reads, as these are the most likely to be repeated (e.g. boot block, FAT directory tables)
|
||||||
if (s == secindex && s < MAX_SEC_CACHE_ENTRIES) {
|
if (count == 1 &&
|
||||||
sector_cache[s].sector = sector;
|
!lock_cluster_cache &&
|
||||||
sector_cache[s].visit_count++;
|
cluster_cache_index < MAX_CLUSTER_CACHE_ENTRIES &&
|
||||||
needs_cache_sector = true;
|
cluster_lookup_index == CLUSTER_LOOKUP_EMPTY_ENTRY)
|
||||||
secindex++;
|
{
|
||||||
}
|
cluster_cache[cluster_cache_index].cluster_num = cluster;
|
||||||
|
cluster_cache[cluster_cache_index].visit_count = 1;
|
||||||
|
cluster_cache[cluster_cache_index].dirty = 0;
|
||||||
|
cluster_lookup[cluster] = cluster_cache_index;
|
||||||
|
|
||||||
|
// Read and decrypt the whole cluster the sector resides in
|
||||||
|
if (!nx_emmc_part_read(&storage, system_part, aligned_sector, SECTORS_PER_CLUSTER, emmc_buffer))
|
||||||
|
return RES_ERROR;
|
||||||
|
_emmc_xts(9, 8, 0, tweak, true, 0, cluster, emmc_buffer, emmc_buffer, XTS_CLUSTER_SIZE);
|
||||||
|
memcpy(cluster_cache[cluster_cache_index].cluster, emmc_buffer, XTS_CLUSTER_SIZE);
|
||||||
|
memcpy(buff, emmc_buffer + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, NX_EMMC_BLOCKSIZE);
|
||||||
|
prev_cluster = -1;
|
||||||
|
prev_sector = 0;
|
||||||
|
cluster_cache_index++;
|
||||||
|
return RES_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nx_emmc_part_read(&storage, system_part, sector, count, buff)) {
|
if (!nx_emmc_part_read(&storage, system_part, sector, count, buff))
|
||||||
|
return RES_ERROR;
|
||||||
u32 tweak_exp = 0;
|
u32 tweak_exp = 0;
|
||||||
bool regen_tweak = true;
|
bool regen_tweak = true;
|
||||||
if (prev_cluster != sector / 0x20) { // sector in different cluster than last read
|
if (prev_cluster != cluster)
|
||||||
prev_cluster = sector / 0x20;
|
{ // Sector is in different cluster than last read
|
||||||
tweak_exp = sector % 0x20;
|
prev_cluster = cluster;
|
||||||
} else if (sector > prev_sector) { // sector in same cluster and past last sector
|
tweak_exp = sector_index_in_cluster;
|
||||||
|
}
|
||||||
|
else if (sector > prev_sector)
|
||||||
|
{ // Sector is in same cluster and past last sector
|
||||||
|
// Calculates the new tweak using the saved one, reducing expensive _gf256_mul_x_le calls
|
||||||
tweak_exp = sector - prev_sector - 1;
|
tweak_exp = sector - prev_sector - 1;
|
||||||
regen_tweak = false;
|
regen_tweak = false;
|
||||||
} else { // sector in same cluster and before or same as last sector
|
}
|
||||||
tweak_exp = sector % 0x20;
|
else
|
||||||
|
{ // Sector is in same cluster and before or same as last sector
|
||||||
|
tweak_exp = sector_index_in_cluster;
|
||||||
}
|
}
|
||||||
|
|
||||||
// fatfs will never pull more than a cluster
|
// FatFs will never pull more than one 16 KiB cluster (XTS_CLUSTER_SIZE = 0x4000, i.e. 0x20 sectors), which is the same as the crypto 'sector' size
|
||||||
_emmc_xts(9, 8, 0, tweak, regen_tweak, tweak_exp, prev_cluster, buff, buff, count * 0x200);
|
_emmc_xts(9, 8, 0, tweak, regen_tweak, tweak_exp, prev_cluster, buff, buff, count * NX_EMMC_BLOCKSIZE);
|
||||||
if (needs_cache_sector) {
|
|
||||||
memcpy(sector_cache[s].cached_sector, buff, 0x200);
|
|
||||||
memcpy(sector_cache[s].tweak, tweak, 0x10);
|
|
||||||
}
|
|
||||||
prev_sector = sector + count - 1;
|
prev_sector = sector + count - 1;
|
||||||
return RES_OK;
|
return RES_OK;
|
||||||
}
|
}
|
||||||
return RES_ERROR;
|
|
||||||
}
|
|
||||||
return RES_ERROR;
|
return RES_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -216,15 +245,15 @@ DRESULT disk_write (
|
||||||
UINT count /* Number of sectors to write */
|
UINT count /* Number of sectors to write */
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
if (pdrv == 1)
|
switch (pdrv)
|
||||||
return RES_WRPRT;
|
{
|
||||||
|
case 0:
|
||||||
if (((u32)buff >= DRAM_START) && !((u32)buff % 8))
|
|
||||||
return sdmmc_storage_write(&sd_storage, sector, count, (void *)buff) ? RES_OK : RES_ERROR;
|
return sdmmc_storage_write(&sd_storage, sector, count, (void *)buff) ? RES_OK : RES_ERROR;
|
||||||
u8 *buf = (u8 *)SDMMC_UPPER_BUFFER; //TODO: define this somewhere.
|
|
||||||
memcpy(buf, buff, 512 * count);
|
case 1:
|
||||||
if (sdmmc_storage_write(&sd_storage, sector, count, buf))
|
return RES_WRPRT;
|
||||||
return RES_OK;
|
}
|
||||||
|
|
||||||
return RES_ERROR;
|
return RES_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -55,14 +55,14 @@ static void _gf256_mul_x(void *block)
|
||||||
|
|
||||||
static void _gf256_mul_x_le(void *block)
|
static void _gf256_mul_x_le(void *block)
|
||||||
{
|
{
|
||||||
u8 *pdata = (u8 *)block;
|
u32 *pdata = (u32 *)block;
|
||||||
u32 carry = 0;
|
u32 carry = 0;
|
||||||
|
|
||||||
for (u32 i = 0; i < 0x10; i++)
|
for (u32 i = 0; i < 4; i++)
|
||||||
{
|
{
|
||||||
u8 b = pdata[i];
|
u32 b = pdata[i];
|
||||||
pdata[i] = (b << 1) | carry;
|
pdata[i] = (b << 1) | carry;
|
||||||
carry = b >> 7;
|
carry = b >> 31;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (carry)
|
if (carry)
|
||||||
|
|
Loading…
Reference in a new issue