2022
Oswald, Nicolai; Nagarajan, Vijay; Sorin, Daniel J.; Gavrielatos, Vasilis; Olausson, Theo; Carr, Reece
HeteroGen: Automatic Synthesis of Heterogeneous Cache Coherence Protocols Inproceedings
In: 28th IEEE International Symposium on High-Performance Computer Architecture (HPCA-28), IEEE Press, Seoul, South Korea, 2022.
@inproceedings{nokey,
title = {HeteroGen: Automatic Synthesis of Heterogeneous Cache Coherence Protocols},
author = {Nicolai Oswald and Vijay Nagarajan and Daniel J. Sorin and Vasilis Gavrielatos and Theo Olausson and Reece Carr},
url = {https://github.com/Errare-humanum-est/HeteroGen},
doi = {https://doi.org/10.5281/zenodo.5826339},
year = {2022},
date = {2022-04-02},
urldate = {2022-04-02},
booktitle = {28th IEEE International Symposium on High-Performance Computer Architecture (HPCA-28)},
publisher = {IEEE Press},
address = {Seoul, South Korea},
abstract = {We solve the two challenges architects face when designing heterogeneous processors with cache coherent shared memory. First, we develop an automated tool, called HeteroGen, for composing clusters of cores, each with its own coherence protocol. Second, we show that the output of HeteroGen adheres to a precisely defined memory consistency model that we call a compound consistency model. For a wide variety of protocols --- including the MOESI variants, as well as those that are targeted towards Total Store Order and Release Consistency --- we show that HeteroGen can correctly fuse them. To validate HeteroGen, we develop the first litmus tests for verifying that heterogeneous protocols satisfy compound consistency models. To understand the possible performance implications of automatic protocol generation, we compared against a publicly available manually-generated heterogeneous protocol. Our results show that performance is comparable.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Patil, Adarsh; Nagarajan, Vijay; Balasubramonian, Rajeev; Oswald, Nicolai
Dvé: Improving DRAM Reliability and Performance On-Demand via Coherent Replication Inproceedings
In: 48th ACM/IEEE Annual International Symposium on Computer Architecture, ISCA 2021, Valencia, Spain, June 14-18, 2021, pp. 526–539, IEEE, 2021.
@inproceedings{DBLP:conf/isca/PatilNBO21,
title = {Dvé: Improving DRAM Reliability and Performance On-Demand via Coherent Replication},
author = {Adarsh Patil and Vijay Nagarajan and Rajeev Balasubramonian and Nicolai Oswald},
url = {https://github.com/Errare-humanum-est/Dve_Protocol_Model_Check},
doi = {10.1109/ISCA52012.2021.00048},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {48th ACM/IEEE Annual International Symposium on Computer Architecture,
ISCA 2021, Valencia, Spain, June 14-18, 2021},
pages = {526--539},
publisher = {IEEE},
abstract = {As technologies continue to shrink, memory system failure rates have increased, demanding support for stronger forms of reliability. In this work, we take inspiration from the two-tier approach that decouples correction from detection and explore a novel extrapolation. We propose Dvé, a hardware-driven replication mechanism where data blocks are replicated in 2 different sockets across a cache-coherent NUMA system. Each data block is also accompanied by a code with strong error detection capabilities so that when an error is detected, correction is performed using the replica. Such an organization has the advantage of offering two independent points of access to data which enables: (a) strong error correction that can recover from a range of faults affecting any of the components in the memory, upto and including the memory controller, and (b) higher performance by providing another nearer point of memory access. Dvé realizes both of these benefits via Coherent Replication, a technique that builds on top of existing cache coherence protocols for not only keeping the replicas in sync for reliability, but also to provide coherent access to the replicas during fault-free operation for performance. Dvé can flexibly provide these benefits on-demand by simply using the provisioned memory capacity which, as reported in recent studies, is often underutilized in today’s systems. Thus, Dvé introduces a unique design point that offers higher reliability and performance for workloads that do not require the entire memory capacity.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2020
Oswald, Nicolai; Nagarajan, Vijay; Sorin, Daniel J
HieraGen: Automated Generation of Concurrent, Hierarchical Cache Coherence Protocols Inproceedings
In: 2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA), pp. 888-899, 2020.
@inproceedings{9138912,
title = {HieraGen: Automated Generation of Concurrent, Hierarchical Cache Coherence Protocols},
author = {Nicolai Oswald and Vijay Nagarajan and Daniel J Sorin},
url = {https://github.com/Errare-humanum-est/HieraGen},
doi = {10.1109/ISCA45697.2020.00077},
year = {2020},
date = {2020-05-01},
urldate = {2020-05-01},
booktitle = {2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA)},
pages = {888-899},
abstract = {We present HieraGen, a new tool for automatically generating hierarchical cache coherence protocols. HieraGen's inputs are the simple, atomic, stable state protocols for each level of the hierarchy. HieraGen's output is a highly concurrent hierarchical protocol, in the form of the finite state machines for all of the cache and directory controllers. HieraGen thus reduces the complexity that architects face, by offloading the challenging tasks of composing protocols and managing concurrency. Experiments show that HieraGen can automatically generate correct-by-construction MOESI family of hierarchical protocols with dozens of states and hundreds of transitions. We have verified all of the generated protocols for safety and deadlock freedom using a model checker.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2018
Oswald, Nicolai; Nagarajan, Vijay; Sorin, Daniel J
Protogen: Automatically Generating Directory Cache Coherence Protocols from Atomic Specifications Inproceedings
In: Proceedings of the 45th Annual International Symposium on Computer Architecture, pp. 247–260, IEEE Press, Los Angeles, California, 2018, ISBN: 9781538659847.
@inproceedings{10.1109/ISCA.2018.00030,
title = {Protogen: Automatically Generating Directory Cache Coherence Protocols from Atomic Specifications},
author = {Nicolai Oswald and Vijay Nagarajan and Daniel J Sorin},
url = {https://github.com/Errare-humanum-est/ProtoGen},
doi = {10.1109/ISCA.2018.00030},
isbn = {9781538659847},
year = {2018},
date = {2018-01-01},
urldate = {2018-01-01},
booktitle = {Proceedings of the 45th Annual International Symposium on Computer Architecture},
pages = {247–260},
publisher = {IEEE Press},
address = {Los Angeles, California},
series = {ISCA '18},
abstract = {Designing directory cache coherence protocols is complicated because coherence transactions are not atomic in modern multicore processors. A coherence transaction comprises multiple messages, and these messages can interleave with other conflicting coherence transactions initiated by other cores. To overcome this architectural challenge, we present ProtoGen, an automated tool for taking the description of a directory protocol with atomic transactions (i.e., no concurrency) and generating the corresponding protocol for a multicore with non-atomic transactions. ProtoGen outputs the finite state machines for the cache and directory controllers, including all of the transient states that are possible with concurrent transactions. We have used ProtoGen to generate complete MSI, MESI, and MOSI protocols given their stable state protocol specifications. We have verified the generated protocols for safety and deadlock freedom using the Murϕ model checker. Our generated protocols are identical to or better than manually generated protocols, at times even discovering opportunities to reduce stalling.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Gavrielatos, Vasilis; Katsarakis, Antonios; Joshi, Arpit; Oswald, Nicolai; Grot, Boris; Nagarajan, Vijay
Scale-out CcNUMA: Exploiting Skew with Strongly Consistent Caching Inproceedings
In: Proceedings of the Thirteenth EuroSys Conference, Association for Computing Machinery, Porto, Portugal, 2018, ISBN: 9781450355841.
@inproceedings{10.1145/3190508.3190550,
title = {Scale-out CcNUMA: Exploiting Skew with Strongly Consistent Caching},
author = {Vasilis Gavrielatos and Antonios Katsarakis and Arpit Joshi and Nicolai Oswald and Boris Grot and Vijay Nagarajan},
url = {https://github.com/Errare-humanum-est/Linearization-Protocol},
doi = {10.1145/3190508.3190550},
isbn = {9781450355841},
year = {2018},
date = {2018-01-01},
urldate = {2018-01-01},
booktitle = {Proceedings of the Thirteenth EuroSys Conference},
publisher = {Association for Computing Machinery},
address = {Porto, Portugal},
series = {EuroSys '18},
abstract = {Today's cloud based online services are underpinned by distributed key-value stores (KVS). Such KVS typically use a scale-out architecture, whereby the dataset is partitioned across a pool of servers, each holding a chunk of the dataset in memory and being responsible for serving queries against the chunk. One important performance bottleneck that a KVS design must address is the load imbalance caused by skewed popularity distributions. Despite recent work on skew mitigation, existing approaches offer only limited benefit for high-throughput in-memory KVS deployments.In this paper, we embrace popularity skew as a performance opportunity. Our insight is that aggressively caching popular items at all nodes of the KVS enables both load balance and high throughput - a combination that has eluded previous approaches. We introduce symmetric caching, wherein every server node is provisioned with a small cache that maintains the most popular objects in the dataset. To ensure consistency across the caches, we use high-throughput fully-distributed consistency protocols. A key result of this work is that strong consistency guarantees (per-key linearizability) need not compromise on performance. In a 9-node RDMA-based rack and with modest write ratios, our prototype design, dubbed ccKVS, achieves 2.2x the throughput of the state-of-the-art KVS while guaranteeing strong consistency.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}