#include #include #include "inc/prefetcher.h" // Submission ID: 3 // Paper title: A Best-Offset Prefetcher // Author: Pierre Michaud //###################################################################################### // PREFETCHER PARAMETERS //###################################################################################### // Because prefetch cannot cross 4KB-page boundaries, there is no need to consider offsets // greater than 63. However, with pages larger than 4KB, it would be beneficial to consider // larger offsets. #define NOFFSETS 46 int OFFSET[NOFFSETS] = {1,-1,2,-2,3,-3,4,-4,5,-5,6,-6,7,-7,8,-8,9,-9,10,-10,11,-11,12,-12,13,-13,14,-14,15,-15,16,-16,18,-18,20,-20,24,-24,30,-30,32,-32,36,-36,40,-40}; #define DEFAULT_OFFSET 1 #define SCORE_MAX 31 #define ROUND_MAX 100 #define RRINDEX 6 #define RRTAG 12 #define DELAYQSIZE 15 #define DELAY 60 #define TIME_BITS 12 #define LLC_RATE_MAX 255 #define GAUGE_MAX 8191 #define MSHR_THRESHOLD_MAX (L2_MSHR_COUNT-4) #define MSHR_THRESHOLD_MIN 2 #define LOW_SCORE 20 #define BAD_SCORE ((knob_small_llc)? 10 : 1) #define BANDWIDTH ((knob_low_bandwidth)? 64 : 16) //###################################################################################### // PREFETCHER STATE //###################################################################################### int prefetch_offset; // 7 bits (6-bit value + 1 sign bit) // Recent Requests (RR) table: 2 banks, 64 entries per bank, RRTAG bits per entry int recent_request[2][1<> 6) == 0) #define INCREMENT(x,n) {x++; if (x==(n)) x=0;} #define TRUNCATE(x,nbits) (((x) & ((1<<(nbits))-1))) typedef long long t_addr; //###################################################################################### // RECENT REQUESTS TABLE (RR) //###################################################################################### void rr_init() { int i; for (i=0; i<(1<>RRINDEX,RRTAG); } int rr_index_left(t_addr lineaddr) { return TRUNCATE(lineaddr^(lineaddr>>RRINDEX),RRINDEX); } int rr_index_right(t_addr lineaddr) { return TRUNCATE(lineaddr^(lineaddr>>(2*RRINDEX)),RRINDEX); } void rr_insert_left(t_addr lineaddr) { int i = rr_index_left(lineaddr); recent_request[0][i] = rr_tag(lineaddr); } void rr_insert_right(t_addr lineaddr) { int i = rr_index_right(lineaddr); recent_request[1][i] = rr_tag(lineaddr); } int rr_hit(t_addr lineaddr) { int i = rr_index_left(lineaddr); int j = rr_index_right(lineaddr); int tag = rr_tag(lineaddr); return (recent_request[0][i] == tag) || (recent_request[1][j] == tag); } //###################################################################################### // DELAY QUEUE (DQ) //###################################################################################### // Without the delay queue, the prefetcher would always try to select an offset value // large enough for having timely prefetches. However, sometimes, a small offset yields // late prefetches but greater prefetch accuracy and better performance. The delay queue // is an imperfect solution to this problem. // This implementation of the delay queue is specific to the DPC2 simulator, as the DPC2 // prefetcher can act only at certain clock cycles. In a real processor, the delay queue // implementation can be simpler. void dq_init() { int i; for (i=0; i= issuecycle) { return (cycle < issuecycle) || (cycle >= readycycle); } else { return (cycle < issuecycle) && (cycle >= readycycle); } } void dq_pop() { // dequeue the entries that are ready to be dequeued, // and do a write in the "left" bank of the RR table for each of them int i; for (i=0; i LOW_SCORE) || (pt.llc_rate > (2*BANDWIDTH))) { // prefetch accuracy not too bad, or low bandwidth requirement // ==> maximum prefetch aggressiveness pt.mshr_threshold = MSHR_THRESHOLD_MAX; } else if (pt.llc_rate < BANDWIDTH) { // LLC access rate exceeds memory bandwidth, implying that there are some LLC hits. // If there are more LLC misses than hits, perhaps memory bandwidth saturates. // If there are more LLC hits than misses, the MSHR is probably not stressed. // So we set the MSHR threshold low. pt.mshr_threshold = MSHR_THRESHOLD_MIN; } else { // in-between situation: we set the MSHR threshold proportionally to the (inverse) LLC rate pt.mshr_threshold = MSHR_THRESHOLD_MIN + (MSHR_THRESHOLD_MAX-MSHR_THRESHOLD_MIN) * (double) (pt.llc_rate - BANDWIDTH) / BANDWIDTH; } } // The pt_llc_access function estimates the average time between consecutive LLC accesses. // It is called on every LLC access. void pt_llc_access() { // update the gauge int cycle = TRUNCATE(get_current_cycle(0),TIME_BITS); int dt = TRUNCATE(cycle - pt.last_cycle,TIME_BITS); pt.last_cycle = cycle; pt.llc_rate_gauge += dt - pt.llc_rate; // if the gauge reaches its upper limit, increment the rate counter // if the gauge reaches its lower limit, decrement the rate counter // otherwise leave the rate counter unchanged if (pt.llc_rate_gauge > GAUGE_MAX) { pt.llc_rate_gauge = GAUGE_MAX; if (pt.llc_rate < LLC_RATE_MAX) { pt.llc_rate++; pt_update_mshr_threshold(); } } else if (pt.llc_rate_gauge < 0) { pt.llc_rate_gauge = 0; if (pt.llc_rate > 0) { pt.llc_rate--; pt_update_mshr_threshold(); } } } //###################################################################################### // OFFSETS SCORES (OS) //###################################################################################### // A method for determining the best offset value void os_reset() { int i; for (i=0; i increment the score os.score[os.p]++; if (os.score[os.p] >= os.max_score) { os.max_score = os.score[os.p]; os.best_offset = testoffset; } } if (os.p == (NOFFSETS-1)) { // one round finished os.round++; if ((os.max_score == SCORE_MAX) || (os.round == ROUND_MAX)) { // learning phase is finished, update the prefetch offset prefetch_offset = (os.best_offset != 0)? os.best_offset : DEFAULT_OFFSET; pt.prefetch_score = os.max_score; pt_update_mshr_threshold(); if (os.max_score <= BAD_SCORE) { // prefetch accuracy is likely to be very low ==> turn the prefetch off prefetch_offset = 0; } // new learning phase starts os_reset(); return; } } INCREMENT(os.p,NOFFSETS); // prepare to test the next offset } //###################################################################################### // OFFSET PREFETCHER //###################################################################################### // Issue at most one prefetch request. The prefetch line address is obtained by adding // the prefetch offset to the current line address int issue_prefetch(t_addr lineaddr, int offset) { if (offset == 0) { // The prefetcher is currently turned off. // Just push the line address into the delay queue for best-offset learning. dq_push(lineaddr); return 0; } if (! SAMEPAGE(lineaddr,lineaddr+offset)) { // crossing the page boundary, no prefetch request issued return 0; } if (get_l2_mshr_occupancy(0) < pt.mshr_threshold) { // prefetch into L2 dq_push(lineaddr); return l2_prefetch_line(0,lineaddr< LOW_SCORE) { return l2_prefetch_line(0,lineaddr<> LOGLINE; int s = l2_get_set(addr); int w = l2_get_way(0,addr,s); int l2_hit = (w>=0); int prefetched = 0; if (l2_hit) { // read the prefetch bit, and reset it prefetched = prefetch_bit[s][w]; prefetch_bit[s][w] = 0; } else { pt_llc_access(); } dq_pop(); int prefetch_issued = 0; if (! l2_hit || prefetched) { os_learn_best_offset(lineaddr); prefetch_issued = issue_prefetch(lineaddr,prefetch_offset); if (prefetch_issued) { // assume the prefetch request is a L2 miss (we don't know actually) pt_llc_access(); } } } void l2_cache_fill(int cpu_num, unsigned long long int addr, int set, int way, int prefetch, unsigned long long int evicted_addr) { // In this version of the DPC2 simulator, the "prefetch" boolean passed // as input here is not reset whenever a demand request hits in the L2 // MSHR on an in-flight prefetch request. Fortunately, this is the information // we need for updating the RR table for best-offset learning. // However, the prefetch bit stored in the L2 is not completely accurate // (though hopefully this does not impact performance too much). // In a real hardware implementation of the BO prefetcher, we would distinguish // "prefetched" and "demand-requested", which are independent informations. t_addr lineaddr = addr >> LOGLINE; // write the prefetch bit int s = l2_get_set(addr); int w = l2_get_way(0,addr,s); prefetch_bit[s][w] = prefetch; // write the "right" bank of the RR table t_addr baselineaddr; if (prefetch || (prefetch_offset == 0)) { baselineaddr = lineaddr - prefetch_offset; if (SAMEPAGE(lineaddr,baselineaddr)) { rr_insert_right(baselineaddr); } } } void l2_prefetcher_heartbeat_stats(int cpu_num) { } void l2_prefetcher_warmup_stats(int cpu_num) { } void l2_prefetcher_final_stats(int cpu_num) { }