本文是CS:APP学习笔记的一部分:

相关的代码都放在了GitHub下了:RayZhang13/CSAPP-Labs

关于学习资源和视频等问题可以参考第一次写的Data Lab开头中提及的相关问题。

前段时间在整编译原理大作业,不得不说太硬核了,龙书也是非常精彩… 然后我的CS:APP计划又没时间做了。

这次的实验是Architecture Lab,在本次试验中,我们将了解Y86-64处理器的设计和实现,通过修改处理器的HCL描述来增加新的指令、修改循环策略等,使得修改后的处理器可以成功模拟相应的功能,并最终通过自动化测试。

准备

对应课程

很遗憾的是这次的Architecture Lab作业,貌似在上述介绍到的B站课程中,并没有覆盖到第4章(处理器体系结构)的内容,而是直接跳转到了第5章(优化程序性能),对应B站网课P10。

P.S. 别看我,B站没有第4章,CMU自己放出的原版视频也没有第4章 :(

我的建议是由于本次试验涉及Y86-64相关汇编语言的操作,对Y86进行HCL描述,以及流水线和程序优化等知识。直接去看第5章网课可能由于对流水线等知识点不熟悉而产生困惑,建议还是先把书本第4章自己看完。

课程文件

相关的作业可以在CMU的官网上找到:

Lab Assignments

在Architecture Lab一栏里面,我们可以查看相关文件,例如:

下载后并解压的文件如图所示:

使用环境

运行环境,保险起见还是使用Linux,如下为我目前使用的Linux版本和GCC对应版本:

前期准备

我们首先对sim.tar解压,进入到sim目录下,此目录下存储着Y86-64的相关工具。所有的相关工作都在此文件夹下开展。

在此文件夹下我们执行make clean; make,完成Y86-64工具的编译。

P.S. 由于我们的GCC版本太高,中途可能会遇到许多奇奇怪怪的问题,毕竟这个Lab从16年之后就再没更新过了。

  • 如果遇到multiple definition of 'lineno'; yas-grammar.o:(.bss+0x0): first defined here 这样的错误,我们需要分别进入miscpipe目录下,修改内部的Makefile,添加参数-fcommon。相关解决方案在Stack overflow上:Fail to compile the Y86 simulatur (CSAPP)
  • 如果遇到/usr/bin/ld: cannot find -ltk: No such file or directory相关报错,可能是缺少组件。在Arch Linux下,通过pacman -S tk即可解决。

每次修改完之后,我们退回到sim目录下,重新执行make clean; make

成功编译结果如下:

题目

Part A

在本部分中,我们将尝试书写一些Y86-64程序,并熟悉Y86-64相关工具。

本部分我们在文件夹sim/misc下进行,我们的目标是根据examples.c中的代码完成相应的Y86-64程序的书写。

examples.c内容如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
* Architecture Lab: Part A
*
* High level specs for the functions that the students will rewrite
* in Y86-64 assembly language
*/

/* $begin examples */
/* linked list element */
typedef struct ELE
{
long val;
struct ELE *next;
} * list_ptr;

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
long val = 0;
while (ls)
{
val += ls->val;
ls = ls->next;
}
return val;
}

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
if (!ls)
return 0;
else
{
long val = ls->val;
long rest = rsum_list(ls->next);
return val + rest;
}
}

/* copy_block - Copy src to dest and return xor checksum of src */
long copy_block(long *src, long *dest, long len)
{
long result = 0;
while (len > 0)
{
long val = *src++;
*dest++ = val;
result ^= val;
len--;
}
return result;
}
/* $end examples */

在创建了对应的ys程序后,相应的我们使用YAS汇编器输出对应的机器代码,并使用YIS指令模拟器模拟Y86-64机器代码程序的执行。

sum.ys 迭代地对链表元素进行相加

本阶段,我们需要根据examples.c中的sum_list()方法,使用Y86-64完成对应的功能。

我们对照着看一下sum_list要实现的功能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
typedef struct ELE
{
long val;
struct ELE *next;
} * list_ptr;

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
long val = 0;
while (ls)
{
val += ls->val;
ls = ls->next;
}
return val;
}

此外在PDF中,还给出了示例链表,希望我们将该链表代入作为测试:

1
2
3
4
5
6
7
8
9
10
11
# Sample linked list
.align 8
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0

代码根据第4章 4.1.5提供的相关代码,大致写作如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Excution begins at address 0
.pos 0
irmovq stack, %rsp # Set up stack pointer
call main # Excute main program
halt # Terminate program

# Sample linked list
.align 8
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0

# main function
main:
irmovq ele1, %rdi
call sum_list # sum_list(ele1)
ret

# sum_list function
sum_list:
irmovq $0, %rax # val = 0
loop:
andq %rdi, %rdi # 判断rdi地址是否有效
je out
mrmovq 0(%rdi), %rsi # rsi = *(rdi)
addq %rsi, %rax # rax += rsi
mrmovq 8(%rdi), %rdi # rdi = *(rdi + 8)
jmp loop
out:
ret

# Stack starts here and grows to lower addresses
.pos 0x200
stack:

完成了对应程序的编写,我们带入到Y86-64模拟器中尝试运行:

有兴趣的话还可以看一下生成的sum.yo的内容,左侧对应了相关的字节码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
                            | # Excution begins at address 0
0x000: | .pos 0
0x000: 30f40002000000000000 | irmovq stack, %rsp # Set up stack pointer
0x00a: 804800000000000000 | call main # Excute main program
0x013: 00 | halt # Terminate program
|
| # Sample linked list
0x018: | .align 8
0x018: | ele1:
0x018: 0a00000000000000 | .quad 0x00a
0x020: 2800000000000000 | .quad ele2
0x028: | ele2:
0x028: b000000000000000 | .quad 0x0b0
0x030: 3800000000000000 | .quad ele3
0x038: | ele3:
0x038: 000c000000000000 | .quad 0xc00
0x040: 0000000000000000 | .quad 0
|
| # main function
0x048: | main:
0x048: 30f71800000000000000 | irmovq ele1, %rdi
0x052: 805c00000000000000 | call sum_list # sum_list(ele1)
0x05b: 90 | ret
|
| # sum_list function
0x05c: | sum_list:
0x05c: 30f00000000000000000 | irmovq $0, %rax # val = 0
0x066: | loop:
0x066: 6277 | andq %rdi, %rdi # 判断rdi地址是否有效
0x068: 739000000000000000 | je out
0x071: 50670000000000000000 | mrmovq 0(%rdi), %rsi # rsi = *(rdi)
0x07b: 6060 | addq %rsi, %rax # rax += rsi
0x07d: 50770800000000000000 | mrmovq 8(%rdi), %rdi # rdi = *(rdi + 8)
0x087: 706600000000000000 | jmp loop
0x090: | out:
0x090: 90 | ret
|
| # Stack starts here and grows to lower addresses
0x200: | .pos 0x200
0x200: | stack:

最终rax输出结果为0xcba,确实为0xa + 0xb0 + 0xc00的计算结果,验证成功。

此外我们还能发现由于使用call指令,对栈空间发生的改变。

rsum.ys 递归地对链表元素进行相加

我们首先看一下要求的功能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
typedef struct ELE
{
long val;
struct ELE *next;
} * list_ptr;

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
if (!ls)
return 0;
else
{
long val = ls->val;
long rest = rsum_list(ls->next);
return val + rest;
}
}

测试用的数据结构同上sum_list,因此与上方类似的,我们尝试写下如下的代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Excution begins at address 0
.pos 0
irmovq stack, %rsp # Set up stack pointer
call main # Excute main program
halt # Terminate program

# Sample linked list
.align 8
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0

# main function
main:
irmovq ele1, %rdi
call rsum_list # rsum_list(ele1)
ret

# rsum_list function
rsum_list:
pushq %rbx
irmovq $0, %rax
andq %rdi, %rdi
jne L1 # 如果节点为nil,直接返回0
irmovq $0, %rax
jmp L2
L1:
mrmovq (%rdi), %rbx
mrmovq 8(%rdi), %rdi # rdi = rdi -> next
call rsum_list
addq %rbx, %rax # node.val + rsum_list(node.next)
L2:
popq %rbx
ret

# Stack starts here and grows to lower addresses
.pos 0x200
stack:

注意到我们使用的rbx寄存器用作数据存储,遵循被调用者使用规则,简单说就是随便用,调用子函数之前要备份它,以防他被修改。而对于一个call指令前后,rbx寄存器的值应当是一致的,中间存在入栈保存现场和出栈恢复现场的过程。

代入模拟器中尝试运行:

此外rsum.yo的内容如下,仅作参考:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
                            | # Excution begins at address 0
0x000: | .pos 0
0x000: 30f40002000000000000 | irmovq stack, %rsp # Set up stack pointer
0x00a: 804800000000000000 | call main # Excute main program
0x013: 00 | halt # Terminate program
|
| # Sample linked list
0x018: | .align 8
0x018: | ele1:
0x018: 0a00000000000000 | .quad 0x00a
0x020: 2800000000000000 | .quad ele2
0x028: | ele2:
0x028: b000000000000000 | .quad 0x0b0
0x030: 3800000000000000 | .quad ele3
0x038: | ele3:
0x038: 000c000000000000 | .quad 0xc00
0x040: 0000000000000000 | .quad 0
|
| # main function
0x048: | main:
0x048: 30f71800000000000000 | irmovq ele1, %rdi
0x052: 805c00000000000000 | call rsum_list # rsum_list(ele1)
0x05b: 90 | ret
|
| # rsum_list function
0x05c: | rsum_list:
0x05c: a03f | pushq %rbx
0x05e: 30f00000000000000000 | irmovq $0, %rax
0x068: 6277 | andq %rdi, %rdi
0x06a: 748600000000000000 | jne L1 # 如果节点为nil,直接返回0
0x073: 30f00000000000000000 | irmovq $0, %rax
0x07d: 70a500000000000000 | jmp L2
0x086: | L1:
0x086: 50370000000000000000 | mrmovq (%rdi), %rbx
0x090: 50770800000000000000 | mrmovq 8(%rdi), %rdi # rdi = rdi -> next
0x09a: 805c00000000000000 | call rsum_list
0x0a3: 6030 | addq %rbx, %rax # node.val + rsum_list(node.next)
0x0a5: | L2:
0x0a5: b03f | popq %rbx
0x0a7: 90 | ret
|
| # Stack starts here and grows to lower addresses
0x200: | .pos 0x200
0x200: | stack:

结果同上,也是正确的。此外还可以观察到由于我们这次是使用递归调用的,因此使用了更多的栈空间。

copy.ys 复制数据块

我们先看C的实现逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
/* copy_block - Copy src to dest and return xor checksum of src */
long copy_block(long *src, long *dest, long len)
{
long result = 0;
while (len > 0)
{
long val = *src++;
*dest++ = val;
result ^= val;
len--;
}
return result;
}

不难看出,copy_block传入的前两个参数为两个long数组,分别表示了一一对应的源地址和目标地址,希望我们将从数据块从src复制到dest

PDF中提供了如下的测试数据:

1
2
3
4
5
6
7
8
9
10
11
.align 8
# Source block
src:
.quad 0x00a
.quad 0x0b0
.quad 0xc00
# Destination block
dest:
.quad 0x111
.quad 0x222
.quad 0x333

我们根据代码写出如下代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Excution begins at address 0
.pos 0
irmovq stack, %rsp # Set up stack pointer
call main # Excute main program
halt # Terminate program

.align 8
# Source block
src:
.quad 0x00a
.quad 0x0b0
.quad 0xc00
# Destination block
dest:
.quad 0x111
.quad 0x222
.quad 0x333

# main function
main:
irmovq src, %rdi
irmovq dest, %rsi
irmovq $3, %rdx
call copy_block # copy_block(src, dest, 3)
ret

# copy_block function
copy_block:
pushq %r8
pushq %r9
pushq %rcx
irmovq $8, %r8
irmovq $1, %r9
irmovq $0, %rax
L1:
andq %rdx, %rdx
je L2
mrmovq 0(%rdi), %rcx
addq %r8, %rdi
rmmovq %rcx, (%rsi)
addq %r8, %rsi
xorq %rcx, %rax
subq %r9, %rdx
jmp L1
L2:
popq %r8
popq %r9
popq %rcx
ret

# Stack starts here and grows to lower addresses
.pos 0x200
stack:

结果如下:

经过验证0xa ^ 0xb0 ^ 0xc00结果确实为0xcba,结果成立。

copy.yo内容如下,仅供参考:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
                            | # Excution begins at address 0
0x000: | .pos 0
0x000: 30f40002000000000000 | irmovq stack, %rsp # Set up stack pointer
0x00a: 804800000000000000 | call main # Excute main program
0x013: 00 | halt # Terminate program
|
0x018: | .align 8
| # Source block
0x018: | src:
0x018: 0a00000000000000 | .quad 0x00a
0x020: b000000000000000 | .quad 0x0b0
0x028: 000c000000000000 | .quad 0xc00
| # Destination block
0x030: | dest:
0x030: 1101000000000000 | .quad 0x111
0x038: 2202000000000000 | .quad 0x222
0x040: 3303000000000000 | .quad 0x333
|
| # main function
0x048: | main:
0x048: 30f71800000000000000 | irmovq src, %rdi
0x052: 30f63000000000000000 | irmovq dest, %rsi
0x05c: 30f20300000000000000 | irmovq $3, %rdx
0x066: 807000000000000000 | call copy_block # copy_block(src, dest, 3)
0x06f: 90 | ret
|
| # copy_block function
0x070: | copy_block:
0x070: a08f | pushq %r8
0x072: a09f | pushq %r9
0x074: a01f | pushq %rcx
0x076: 30f80800000000000000 | irmovq $8, %r8
0x080: 30f90100000000000000 | irmovq $1, %r9
0x08a: 30f00000000000000000 | irmovq $0, %rax
0x094: | L1:
0x094: 6222 | andq %rdx, %rdx
0x096: 73c400000000000000 | je L2
0x09f: 50170000000000000000 | mrmovq 0(%rdi), %rcx
0x0a9: 6087 | addq %r8, %rdi
0x0ab: 40160000000000000000 | rmmovq %rcx, (%rsi)
0x0b5: 6086 | addq %r8, %rsi
0x0b7: 6310 | xorq %rcx, %rax
0x0b9: 6192 | subq %r9, %rdx
0x0bb: 709400000000000000 | jmp L1
0x0c4: | L2:
0x0c4: b08f | popq %r8
0x0c6: b09f | popq %r9
0x0c8: b01f | popq %rcx
0x0ca: 90 | ret
|
| # Stack starts here and grows to lower addresses
0x200: | .pos 0x200
0x200: | stack:

Part B

在本部分,我们在文件夹sim/seq下来进行。

该部分内容主要是为SEQ处理器添加指令IADDQ,所要修改的文件为seq-full.hcl

由于iaddq指令既与运算操作相关,又与立即数处理相关,故该指令的功能添加可以参考seq-full.hcl中的IOPQ以及IIRMOVQ来编写。

iaddq指令相关要求,详见CS:APP第三版中文版P328家庭作业4.51以及4.52,P253 练习题4.3。

sql-full.hcl目前内容如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#/* $begin seq-all-hcl */
####################################################################
# HCL Description of Control for Single Cycle Y86-64 Processor SEQ #
# Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2010 #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work

####################################################################
# C Include's. Don't alter these #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'word_t gen_pc(){return 0;}'
quote 'int main(int argc, char *argv[])'
quote ' {plusmode=0;return sim_main(argc,argv);}'

####################################################################
# Declarations. Do not change/remove/delete any of these #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 'I_NOP'
wordsig IHALT 'I_HALT'
wordsig IRRMOVQ 'I_RRMOVQ'
wordsig IIRMOVQ 'I_IRMOVQ'
wordsig IRMMOVQ 'I_RMMOVQ'
wordsig IMRMOVQ 'I_MRMOVQ'
wordsig IOPQ 'I_ALU'
wordsig IJXX 'I_JMP'
wordsig ICALL 'I_CALL'
wordsig IRET 'I_RET'
wordsig IPUSHQ 'I_PUSHQ'
wordsig IPOPQ 'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ 'I_IADDQ'

##### Symbolic represenations of Y86-64 function codes #####
wordsig FNONE 'F_NONE' # Default function code

##### Symbolic representation of Y86-64 Registers referenced explicitly #####
wordsig RRSP 'REG_RSP' # Stack Pointer
wordsig RNONE 'REG_NONE' # Special value indicating "no register"

##### ALU Functions referenced explicitly #####
wordsig ALUADD 'A_ADD' # ALU should add its arguments

##### Possible instruction status values #####
wordsig SAOK 'STAT_AOK' # Normal execution
wordsig SADR 'STAT_ADR' # Invalid memory address
wordsig SINS 'STAT_INS' # Invalid instruction
wordsig SHLT 'STAT_HLT' # Halt instruction encountered

##### Signals that can be referenced by control logic ####################

##### Fetch stage inputs #####
wordsig pc 'pc' # Program counter
##### Fetch stage computations #####
wordsig imem_icode 'imem_icode' # icode field from instruction memory
wordsig imem_ifun 'imem_ifun' # ifun field from instruction memory
wordsig icode 'icode' # Instruction control code
wordsig ifun 'ifun' # Instruction function
wordsig rA 'ra' # rA field from instruction
wordsig rB 'rb' # rB field from instruction
wordsig valC 'valc' # Constant from instruction
wordsig valP 'valp' # Address of following instruction
boolsig imem_error 'imem_error' # Error signal from instruction memory
boolsig instr_valid 'instr_valid' # Is fetched instruction valid?

##### Decode stage computations #####
wordsig valA 'vala' # Value from register A port
wordsig valB 'valb' # Value from register B port

##### Execute stage computations #####
wordsig valE 'vale' # Value computed by ALU
boolsig Cnd 'cond' # Branch test

##### Memory stage computations #####
wordsig valM 'valm' # Value read from memory
boolsig dmem_error 'dmem_error' # Error signal from data memory


####################################################################
# Control Signal Definitions. #
####################################################################

################ Fetch Stage ###################################

# Determine instruction code
word icode = [
imem_error: INOP;
1: imem_icode; # Default: get from instruction memory
];

# Determine instruction function
word ifun = [
imem_error: FNONE;
1: imem_ifun; # Default: get from instruction memory
];

bool instr_valid = icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };

# Does fetched instruction require a regid byte?
bool need_regids =
icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
IIRMOVQ, IRMMOVQ, IMRMOVQ };

# Does fetched instruction require a constant word?
bool need_valC =
icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };

################ Decode Stage ###################################

## What register should be used as the A source?
word srcA = [
icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : rA;
icode in { IPOPQ, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word srcB = [
icode in { IOPQ, IRMMOVQ, IMRMOVQ } : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the E destination?
word dstE = [
icode in { IRRMOVQ } && Cnd : rB;
icode in { IIRMOVQ, IOPQ} : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];

## What register should be used as the M destination?
word dstM = [
icode in { IMRMOVQ, IPOPQ } : rA;
1 : RNONE; # Don't write any register
];

################ Execute Stage ###################################

## Select input A to ALU
word aluA = [
icode in { IRRMOVQ, IOPQ } : valA;
icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : valC;
icode in { ICALL, IPUSHQ } : -8;
icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
IPUSHQ, IRET, IPOPQ } : valB;
icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
icode == IOPQ : ifun;
1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = icode in { IOPQ };

################ Memory Stage ###################################

## Set read control signal
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };

## Select memory address
word mem_addr = [
icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
icode in { IPOPQ, IRET } : valA;
# Other instructions don't need address
];

## Select memory input data
word mem_data = [
# Value from register
icode in { IRMMOVQ, IPUSHQ } : valA;
# Return PC
icode == ICALL : valP;
# Default: Don't write anything
];

## Determine instruction status
word Stat = [
imem_error || dmem_error : SADR;
!instr_valid: SINS;
icode == IHALT : SHLT;
1 : SAOK;
];

################ Program Counter Update ############################

## What address should instruction be fetched at

word new_pc = [
# Call. Use instruction constant
icode == ICALL : valC;
# Taken branch. Use instruction constant
icode == IJXX && Cnd : valC;
# Completion of RET instruction. Use value from stack
icode == IRET : valM;
# Default: Use incremented PC
1 : valP;
];
#/* $end seq-all-hcl */

在头上的注释中已经表明,在程序内已经声明了iaddq(IIADDQ)的相关icode,我们需要做的是补全相关的逻辑,使其工作。

首先,新指令iaddq格式如下,仅作参考:

其实现的功能,简单来说也就是将常数V直接加到寄存器rB中。

首先,我们不妨模仿书上的例子,将iaddq指令对应的6个阶段梳理一下:

  • 取指

    icode:ifunM1[PC]icode:ifun \gets M_1[PC]

    rA:rBM1[PC+1]rA:rB \gets M_1[PC+1]

    valCM8[PC+2]valC \gets M_8[PC+2]

    valPPC+10valP \gets PC + 10

  • 译码

    valBR[rB]valB \gets R[rB]

  • 执行

    valEvalB+valCvalE \gets valB + valC

  • 访存

    无操作

  • 写回

    R[rB]valER[rB] \gets valE

  • 更新PC

    PCvalPPC \gets valP

取指阶段 Fetch Stage

取指阶段中,有icode, ifun, instr_valid, need_regidsneed_valC共5个控制逻辑块。

icodeifun控制块不用多说,是用来从指令内存中读取逻辑块计算指令和功能码的,不必我们进行修改。

instr_valid表示这个字节是否对应于一个合法的Y86-64指令,我们通过这个信号发现不合法的指令。因此,我们需要将IIADDQ加入到其中去,表示一个合法的指令。

need_regids表示指令中是否包括一个寄存器指示符字节,很显然,根据刚刚给出的iaddq指令格式,我们显然看到了寄存器指示符字节,由0xFrB组成。因此我们需要IIADDQ加入其中。

need_valC表示指令是否包含一个常数。同上,我们观察到,iaddq指令在其2~9字节存储了一个常数,也就是我们所说的常数V,即valC。因此,需要加入IIADDQ指令。

综上,我们取指部分最终的结果为:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
################ Fetch Stage     ###################################

# Determine instruction code
word icode = [
imem_error: INOP;
1: imem_icode; # Default: get from instruction memory
];

# Determine instruction function
word ifun = [
imem_error: FNONE;
1: imem_ifun; # Default: get from instruction memory
];

bool instr_valid = icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };

# Does fetched instruction require a regid byte?
bool need_regids =
icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };

# Does fetched instruction require a constant word?
bool need_valC =
icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

译码和写回阶段 Decode&Writeback Stage

在译码和写回阶段中,共有srcA, srcB, dstEdstM,共4个控制逻辑块。在这里,我们处理的是寄存器的读写逻辑。

其中srcAsrcB用于控制选择两个读端口的地址输入,dstEdstM控制选择写端口的地址输入。

很明显的,在iaddq指令中,我们需要对寄存器rB进行读写操作。因此,我们需要对指令IIADDQ指定读取访问寄存器为rB,写入目标寄存器为rB。此外,还注意到IIADDQIRRMOVQ指令不同,并不需要条件Cnd信号控制。

因此,我们最终此部分的逻辑为:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
################ Decode Stage    ###################################

## What register should be used as the A source?
word srcA = [
icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : rA;
icode in { IPOPQ, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word srcB = [
icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the E destination?
word dstE = [
icode in { IRRMOVQ } && Cnd : rB;
icode in { IIRMOVQ, IOPQ, IIADDQ} : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];

## What register should be used as the M destination?
word dstM = [
icode in { IMRMOVQ, IPOPQ } : rA;
1 : RNONE; # Don't write any register
];

执行阶段 Excuse Stage

在执行阶段中,涉及到aluA, aluB, alufunset_cc,共4个控制逻辑块。

aluAaluB控制块用于控制选择两个进入ALU算术/逻辑单元的数据。其中列出的操作数aluB在前,aluA在后,为了保证例如subq指令中是valB - valA,而不是相反操作。我们发现对于IIADDQ指令,需要进行的操作是valB + valC,因此我们将aluB指向valB,将aluA指向常数valC

ALU在执行阶段进行的操作,通常作为加法器来进行使用,但是对于OPq指令(例如与或非运算),有时我们希望使用指令ifun字段中编码的操作,因此我们引入alufun控制块。对于IIADDQ指令,并不需要我们进行修改,默认加法即可。

每次计算时,还有可能根据计算的结果调整条件码寄存器CC的内容(例如之前提及的CF, ZF, SF等寄存器),因此我们引入set_cc控制块来规定是否更新条件码寄存器。对于IIADDQ指令,应当更新。

因此,修改后的结果为:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
################ Execute Stage   ###################################

## Select input A to ALU
word aluA = [
icode in { IRRMOVQ, IOPQ } : valA;
icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC;
icode in { ICALL, IPUSHQ } : -8;
icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
IPUSHQ, IRET, IPOPQ, IIADDQ } : valB;
icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
icode == IOPQ : ifun;
1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = icode in { IOPQ, IIADDQ };

访存阶段 Memory Stage

访存阶段的任务是根据给定的内存地址对内存进行读写,其中有mem_read, mem_write, mem_addr, mem_dataStat,共5个控制块。

mem_read表示是否只从内存读取数据的控制信号,同理mem_read表示是否只向内存写入数据的控制信号。

mem_addr控制块用来控制选择地址数据的来源(可以选择valE或者valA)。

最后Stat控制块根据取指阶段产生的icode, imem_error, instr_valid以及数据内存产生的dmem_error信号,从指令执行的结果来计算状态码Stat

IIADDQ指令中,并不存在和访存相关的操作,因此不需修改,维持原状:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
################ Memory Stage    ###################################

## Set read control signal
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };

## Select memory address
word mem_addr = [
icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
icode in { IPOPQ, IRET } : valA;
# Other instructions don't need address
];

## Select memory input data
word mem_data = [
# Value from register
icode in { IRMMOVQ, IPUSHQ } : valA;
# Return PC
icode == ICALL : valP;
# Default: Don't write anything
];

## Determine instruction status
word Stat = [
imem_error || dmem_error : SADR;
!instr_valid: SINS;
icode == IHALT : SHLT;
1 : SAOK;
];

更新PC阶段 Program Counter Update

在本阶段中,我们完成对程序计数器的更新。使用了一个new_pc的控制逻辑块。

程序计数器指向下一步执行的程序地址,正常情况下如果是按照顺序顺沿,应该就是增加过后的valP。但是有时可能通过jXX等跳转指令实现跳转到其他位置,地址的来源可能是内存中读取的valM,也可能是jXX命令中带有的常数valC

new_pc的功能就是根据需要,通过分析icode(指令的类型)和Cnd的值(jXX指令的跳转条件),对valC, valM, valP进行选择控制。

在这里,我们只需要保持原状,让程序计数器顺沿更新为valP即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
################ Program Counter Update ############################

## What address should instruction be fetched at

word new_pc = [
# Call. Use instruction constant
icode == ICALL : valC;
# Taken branch. Use instruction constant
icode == IJXX && Cnd : valC;
# Completion of RET instruction. Use value from stack
icode == IRET : valM;
# Default: Use incremented PC
1 : valP;
];

测试

我们在sim/seq目录下,进行编译。使用命令make clean; make ssim VERSION=full

由于tcl内部有部分数据结构已经被弃用,编译时可能会爆出错误psim.c:853:8: error: ‘Tcl_Interp’ {aka ‘struct Tcl_Interp’} has no member named ‘result’,详见Stack overflow,最简单的处理办法就是在Makefile上的CFLAGS上添加参数-DUSE_INTERP_RESULT

此外还有可能遇到编译问题/usr/bin/ld: /tmp/ccHA7fEj.o:(.data.rel+0x0): undefined reference to matherr,详见Stack overflow - Fail to build y86-64 simulator from sources。目测可以直接将ssim.c中与matherr相关的代码注释即可,如下:

接下来,我们就成功的编译了模拟器。

我们尝试对刚才的hcl文件进行简单的Y86-64程序测试:

显示通过测试。

接下来我们进行更加全面的基准测试:

显示测试全数通过。

接下来,我们进行回归测试:

通过测试。

Part C

在本部分,我们在文件夹sim/pipe下来进行。

题目中,给定了ncopy函数的C代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
* ncopy - copy src to dst, returning number of positive ints
* contained in src array.
*/
word_t ncopy(word_t *src, word_t *dst, word_t len) {
word_t count = 0;
word_t val;

while(len > 0) {
val = *src++;
*dst++ = val;
if (val > 0)
count++;
len--;
}
return count;
}

功能为,将 src 数组复制到 dest 数组,并返回数组中的正数的总数。

此外,题目还提供了ncopy的Y86-64代码ncopy.ys

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:

Loop: mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, goto Npos:
irmovq $1, %r10
addq %r10, %rax # count++
Npos: irmovq $1, %r10
subq %r10, %rdx # len--
irmovq $8, %r10
addq %r10, %rdi # src++
addq %r10, %rsi # dst++
andq %rdx,%rdx # len > 0?
jg Loop # if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

同Part B中相关内容,题目首先要求我们在pipe-full.hcl文件中实现iaddq指令。

然后通过修改的方式对ncopy.ys汇编文件和进行pipe-full.hcl效率优化。

实现IIADDQ

修改前的pipe-full.hcl如下所示,其中在文件中只是简单的对IIADDQ进行了声明。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#/* $begin pipe-all-hcl */
####################################################################
# HCL Description of Control for Pipelined Y86-64 Processor #
# Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2014 #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work

####################################################################
# C Include's. Don't alter these #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "pipeline.h"'
quote '#include "stages.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'int main(int argc, char *argv[]){return sim_main(argc,argv);}'

####################################################################
# Declarations. Do not change/remove/delete any of these #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 'I_NOP'
wordsig IHALT 'I_HALT'
wordsig IRRMOVQ 'I_RRMOVQ'
wordsig IIRMOVQ 'I_IRMOVQ'
wordsig IRMMOVQ 'I_RMMOVQ'
wordsig IMRMOVQ 'I_MRMOVQ'
wordsig IOPQ 'I_ALU'
wordsig IJXX 'I_JMP'
wordsig ICALL 'I_CALL'
wordsig IRET 'I_RET'
wordsig IPUSHQ 'I_PUSHQ'
wordsig IPOPQ 'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ 'I_IADDQ'

##### Symbolic represenations of Y86-64 function codes #####
wordsig FNONE 'F_NONE' # Default function code

##### Symbolic representation of Y86-64 Registers referenced #####
wordsig RRSP 'REG_RSP' # Stack Pointer
wordsig RNONE 'REG_NONE' # Special value indicating "no register"

##### ALU Functions referenced explicitly ##########################
wordsig ALUADD 'A_ADD' # ALU should add its arguments

##### Possible instruction status values #####
wordsig SBUB 'STAT_BUB' # Bubble in stage
wordsig SAOK 'STAT_AOK' # Normal execution
wordsig SADR 'STAT_ADR' # Invalid memory address
wordsig SINS 'STAT_INS' # Invalid instruction
wordsig SHLT 'STAT_HLT' # Halt instruction encountered

##### Signals that can be referenced by control logic ##############

##### Pipeline Register F ##########################################

wordsig F_predPC 'pc_curr->pc' # Predicted value of PC

##### Intermediate Values in Fetch Stage ###########################

wordsig imem_icode 'imem_icode' # icode field from instruction memory
wordsig imem_ifun 'imem_ifun' # ifun field from instruction memory
wordsig f_icode 'if_id_next->icode' # (Possibly modified) instruction code
wordsig f_ifun 'if_id_next->ifun' # Fetched instruction function
wordsig f_valC 'if_id_next->valc' # Constant data of fetched instruction
wordsig f_valP 'if_id_next->valp' # Address of following instruction
boolsig imem_error 'imem_error' # Error signal from instruction memory
boolsig instr_valid 'instr_valid' # Is fetched instruction valid?

##### Pipeline Register D ##########################################
wordsig D_icode 'if_id_curr->icode' # Instruction code
wordsig D_rA 'if_id_curr->ra' # rA field from instruction
wordsig D_rB 'if_id_curr->rb' # rB field from instruction
wordsig D_valP 'if_id_curr->valp' # Incremented PC

##### Intermediate Values in Decode Stage #########################

wordsig d_srcA 'id_ex_next->srca' # srcA from decoded instruction
wordsig d_srcB 'id_ex_next->srcb' # srcB from decoded instruction
wordsig d_rvalA 'd_regvala' # valA read from register file
wordsig d_rvalB 'd_regvalb' # valB read from register file

##### Pipeline Register E ##########################################
wordsig E_icode 'id_ex_curr->icode' # Instruction code
wordsig E_ifun 'id_ex_curr->ifun' # Instruction function
wordsig E_valC 'id_ex_curr->valc' # Constant data
wordsig E_srcA 'id_ex_curr->srca' # Source A register ID
wordsig E_valA 'id_ex_curr->vala' # Source A value
wordsig E_srcB 'id_ex_curr->srcb' # Source B register ID
wordsig E_valB 'id_ex_curr->valb' # Source B value
wordsig E_dstE 'id_ex_curr->deste' # Destination E register ID
wordsig E_dstM 'id_ex_curr->destm' # Destination M register ID

##### Intermediate Values in Execute Stage #########################
wordsig e_valE 'ex_mem_next->vale' # valE generated by ALU
boolsig e_Cnd 'ex_mem_next->takebranch' # Does condition hold?
wordsig e_dstE 'ex_mem_next->deste' # dstE (possibly modified to be RNONE)

##### Pipeline Register M #########################
wordsig M_stat 'ex_mem_curr->status' # Instruction status
wordsig M_icode 'ex_mem_curr->icode' # Instruction code
wordsig M_ifun 'ex_mem_curr->ifun' # Instruction function
wordsig M_valA 'ex_mem_curr->vala' # Source A value
wordsig M_dstE 'ex_mem_curr->deste' # Destination E register ID
wordsig M_valE 'ex_mem_curr->vale' # ALU E value
wordsig M_dstM 'ex_mem_curr->destm' # Destination M register ID
boolsig M_Cnd 'ex_mem_curr->takebranch' # Condition flag
boolsig dmem_error 'dmem_error' # Error signal from instruction memory

##### Intermediate Values in Memory Stage ##########################
wordsig m_valM 'mem_wb_next->valm' # valM generated by memory
wordsig m_stat 'mem_wb_next->status' # stat (possibly modified to be SADR)

##### Pipeline Register W ##########################################
wordsig W_stat 'mem_wb_curr->status' # Instruction status
wordsig W_icode 'mem_wb_curr->icode' # Instruction code
wordsig W_dstE 'mem_wb_curr->deste' # Destination E register ID
wordsig W_valE 'mem_wb_curr->vale' # ALU E value
wordsig W_dstM 'mem_wb_curr->destm' # Destination M register ID
wordsig W_valM 'mem_wb_curr->valm' # Memory M value

####################################################################
# Control Signal Definitions. #
####################################################################

################ Fetch Stage ###################################

## What address should instruction be fetched at
word f_pc = [
# Mispredicted branch. Fetch at incremented PC
M_icode == IJXX && !M_Cnd : M_valA;
# Completion of RET instruction
W_icode == IRET : W_valM;
# Default: Use predicted value of PC
1 : F_predPC;
];

## Determine icode of fetched instruction
word f_icode = [
imem_error : INOP;
1: imem_icode;
];

# Determine ifun
word f_ifun = [
imem_error : FNONE;
1: imem_ifun;
];

# Is instruction valid?
bool instr_valid = f_icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };

# Determine status code for fetched instruction
word f_stat = [
imem_error: SADR;
!instr_valid : SINS;
f_icode == IHALT : SHLT;
1 : SAOK;
];

# Does fetched instruction require a regid byte?
bool need_regids =
f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
IIRMOVQ, IRMMOVQ, IMRMOVQ };

# Does fetched instruction require a constant word?
bool need_valC =
f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };

# Predict next value of PC
word f_predPC = [
f_icode in { IJXX, ICALL } : f_valC;
1 : f_valP;
];

################ Decode Stage ######################################


## What register should be used as the A source?
word d_srcA = [
D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : D_rA;
D_icode in { IPOPQ, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word d_srcB = [
D_icode in { IOPQ, IRMMOVQ, IMRMOVQ } : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the E destination?
word d_dstE = [
D_icode in { IRRMOVQ, IIRMOVQ, IOPQ} : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];

## What register should be used as the M destination?
word d_dstM = [
D_icode in { IMRMOVQ, IPOPQ } : D_rA;
1 : RNONE; # Don't write any register
];

## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
d_srcA == e_dstE : e_valE; # Forward valE from execute
d_srcA == M_dstM : m_valM; # Forward valM from memory
d_srcA == M_dstE : M_valE; # Forward valE from memory
d_srcA == W_dstM : W_valM; # Forward valM from write back
d_srcA == W_dstE : W_valE; # Forward valE from write back
1 : d_rvalA; # Use value read from register file
];

word d_valB = [
d_srcB == e_dstE : e_valE; # Forward valE from execute
d_srcB == M_dstM : m_valM; # Forward valM from memory
d_srcB == M_dstE : M_valE; # Forward valE from memory
d_srcB == W_dstM : W_valM; # Forward valM from write back
d_srcB == W_dstE : W_valE; # Forward valE from write back
1 : d_rvalB; # Use value read from register file
];

################ Execute Stage #####################################

## Select input A to ALU
word aluA = [
E_icode in { IRRMOVQ, IOPQ } : E_valA;
E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : E_valC;
E_icode in { ICALL, IPUSHQ } : -8;
E_icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
IPUSHQ, IRET, IPOPQ } : E_valB;
E_icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
E_icode == IOPQ : E_ifun;
1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = E_icode == IOPQ &&
# State changes only during normal operation
!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };

## Generate valA in execute stage
word e_valA = E_valA; # Pass valA through stage

## Set dstE to RNONE in event of not-taken conditional move
word e_dstE = [
E_icode == IRRMOVQ && !e_Cnd : RNONE;
1 : E_dstE;
];

################ Memory Stage ######################################

## Select memory address
word mem_addr = [
M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
M_icode in { IPOPQ, IRET } : M_valA;
# Other instructions don't need address
];

## Set read control signal
bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };

#/* $begin pipe-m_stat-hcl */
## Update the status
word m_stat = [
dmem_error : SADR;
1 : M_stat;
];
#/* $end pipe-m_stat-hcl */

## Set E port register ID
word w_dstE = W_dstE;

## Set E port value
word w_valE = W_valE;

## Set M port register ID
word w_dstM = W_dstM;

## Set M port value
word w_valM = W_valM;

## Update processor status
word Stat = [
W_stat == SBUB : SAOK;
1 : W_stat;
];

################ Pipeline Register Control #########################

# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
bool F_bubble = 0;
bool F_stall =
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB } ||
# Stalling at fetch while ret passes through pipeline
IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
bool D_stall =
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB };

bool D_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
bool E_stall = 0;
bool E_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB};

# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
bool M_stall = 0;
# Start injecting bubbles as soon as exception passes through memory stage
bool M_bubble = m_stat in { SADR, SINS, SHLT } || W_stat in { SADR, SINS, SHLT };

# Should I stall or inject a bubble into Pipeline Register W?
bool W_stall = W_stat in { SADR, SINS, SHLT };
bool W_bubble = 0;
#/* $end pipe-all-hcl */

同Part B中的操作相似,我们要将IIADDQ指令整合到下方的逻辑选择电路中去。

小小的区别就是,这次我们处理的是五段式指令流水线PIPE:

iaddq格式同上,我们这里再写一次,如下,仅作参考:

其实现的功能,简单来说也就是将常数V直接加到寄存器rB中。

对应的6个阶段如下:

  • 取指

    icode:ifunM1[PC]icode:ifun \gets M_1[PC]

    rA:rBM1[PC+1]rA:rB \gets M_1[PC+1]

    valCM8[PC+2]valC \gets M_8[PC+2]

    valPPC+10valP \gets PC + 10

  • 译码

    valBR[rB]valB \gets R[rB]

  • 执行

    valEvalB+valCvalE \gets valB + valC

  • 访存

    无操作

  • 写回

    R[rB]valER[rB] \gets valE

  • 更新PC

    PCvalPPC \gets valP

取指阶段 Fetch Stage

取指阶段共包含了f_pc, f_icode, f_ifun, instr_valid, f_stat, need_regids, need_valCf_predPC,共8个控制逻辑块。

f_pc功能为对选择下一条执行的指令地址,默认情况下f_pc应当选择预测的地址,除非遇到分支预测错误即M_icode == IJXX && !M_Cnd,或者ret指令即W_icode == IRET,才应当进行地址修正(与此同时还会伴随暂停和气泡信号的插入,告知后续的电路停止执行指令,防止造成数据的污染)。这里不需要我们针对IIADDQ进行任何修改。

f_icodef_ifun只有在内存读取异常时才会转为错误码,否则为内存中正常取出的计算指令和功能码。这里不需要我们针对IIADDQ进行任何修改。

instr_valid表示这个字节是否对应于一个合法的Y86-64指令,我们通过这个信号发现不合法的指令。因此,我们需要将IIADDQ加入到其中去,表示一个合法的指令。

f_stat功能为通过分析imem_error, instr_valid等信号,输出取指阶段中指令的临时状态,可输出的信号有SADR, SINS, SHLTSAOK,这些状态码可以在第三版中文版P277查询到。这里不需要我们针对IIADDQ进行任何修改。

need_regids表示指令中是否包括一个寄存器指示符字节,很显然,根据刚刚给出的iaddq指令格式,我们显然看到了寄存器指示符字节,由0xFrB组成。因此我们需要IIADDQ加入其中。

need_valC表示指令是否包含一个常数。同上,我们观察到,iaddq指令在其2~9字节存储了一个常数,也就是我们所说的常数V,即valC。因此,需要加入IIADDQ指令。`

f_predPC功能为选择预测指令地址,即从PC增加后的valP和指令自带常数valC中,选择一个。当指令为IJXXICALL等调用时,我们默认预测跳转指令成立,并选择跳转地址valC作为预测指令地址,否则就默认为当前指令的顺沿下一条地址也就是PC增加后的valP。这里不需要我们针对IIADDQ进行任何修改。

因此,修改后的结果为:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
################ Fetch Stage     ###################################

## What address should instruction be fetched at
word f_pc = [
# Mispredicted branch. Fetch at incremented PC
M_icode == IJXX && !M_Cnd : M_valA;
# Completion of RET instruction
W_icode == IRET : W_valM;
# Default: Use predicted value of PC
1 : F_predPC;
];

## Determine icode of fetched instruction
word f_icode = [
imem_error : INOP;
1: imem_icode;
];

# Determine ifun
word f_ifun = [
imem_error : FNONE;
1: imem_ifun;
];

# Is instruction valid?
bool instr_valid = f_icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };

# Determine status code for fetched instruction
word f_stat = [
imem_error: SADR;
!instr_valid : SINS;
f_icode == IHALT : SHLT;
1 : SAOK;
];

# Does fetched instruction require a regid byte?
bool need_regids =
f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };

# Does fetched instruction require a constant word?
bool need_valC =
f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

# Predict next value of PC
word f_predPC = [
f_icode in { IJXX, ICALL } : f_valC;
1 : f_valP;
];
译码和写回阶段 Decode&Writeback Stage

P.S. 在所有指令中,只有call在访存阶段需要valP的值,只有跳转指令在执行阶段(当不需要跳转时)需要valP的值,而这些指令都不需要从寄存器文件中读出的值,因此我们可以直接合并两个信号,将他们作为信号valA携带穿过流水线。

译码和写回阶段共包含了Sel+FwdA, FwdB, d_dstE, d_dstM, d_srcAd_srcB这几个逻辑控制块。

d_srcAd_srcB用于指定需要访问读取的寄存器,而d_dstEd_dstM用于指定需要写入的寄存器目标。

Sel+FwdAFwdB主要功能为确定传向E寄存器的valAvalB信号来源,即d_valAd_valB。默认情况下这两个逻辑块应当通过srcAsrcB将寄存器文件中的内容读出,但是考虑到数据冒险,因此如果存在对寄存器进行读写的情况,我们就应该通过转发来解决,因此存在额外的5个转发通道。

很明显的,在iaddq指令中,我们需要对寄存器rB进行读写操作。因此,我们需要对指令IIADDQ指定读取访问寄存器为rB,写入目标寄存器为rB

因此,此部分修改为:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
################ Decode Stage ######################################


## What register should be used as the A source?
word d_srcA = [
D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : D_rA;
D_icode in { IPOPQ, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word d_srcB = [
D_icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the E destination?
word d_dstE = [
D_icode in { IRRMOVQ, IIRMOVQ, IOPQ, IIADDQ} : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];

## What register should be used as the M destination?
word d_dstM = [
D_icode in { IMRMOVQ, IPOPQ } : D_rA;
1 : RNONE; # Don't write any register
];

## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
d_srcA == e_dstE : e_valE; # Forward valE from execute
d_srcA == M_dstM : m_valM; # Forward valM from memory
d_srcA == M_dstE : M_valE; # Forward valE from memory
d_srcA == W_dstM : W_valM; # Forward valM from write back
d_srcA == W_dstE : W_valE; # Forward valE from write back
1 : d_rvalA; # Use value read from register file
];

word d_valB = [
d_srcB == e_dstE : e_valE; # Forward valE from execute
d_srcB == M_dstM : m_valM; # Forward valM from memory
d_srcB == M_dstE : M_valE; # Forward valE from memory
d_srcB == W_dstM : W_valM; # Forward valM from write back
d_srcB == W_dstE : W_valE; # Forward valE from write back
1 : d_rvalB; # Use value read from register file
];
执行阶段 Excuse Stage

执行阶段中,其硬件单元和逻辑块同SEQ中十分相似,一方面对使用的信号进行了适当的重命名,一方面标号为Set CC的逻辑以信号m_statW_stat作为输入,这个逻辑决定了是否进行条件码的更新(例如导致异常的指令通过后方流水线时,任何对条件码的更新都应该被禁止)。

aluAaluB控制块用于控制选择两个进入ALU算术/逻辑单元的数据。其中列出的操作数aluB在前,aluA在后,为了保证例如subq指令中是valB - valA,而不是相反操作。我们发现对于IIADDQ指令,需要进行的操作是valB + valC,因此我们将aluB指向valB,将aluA指向常数valC

ALU在执行阶段进行的操作,通常作为加法器来进行使用,但是对于OPq指令(例如与或非运算),有时我们希望使用指令ifun字段中编码的操作,因此我们引入alufun控制块。对于IIADDQ指令,并不需要我们进行修改,默认加法即可。

每次计算时,还有可能根据计算的结果调整条件码寄存器CC的内容(例如之前提及的CF, ZF, SF等寄存器),因此我们引入set_cc控制块来规定是否更新条件码寄存器。对于IIADDQ指令,应当更新。

修改后逻辑如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
################ Execute Stage #####################################

## Select input A to ALU
word aluA = [
E_icode in { IRRMOVQ, IOPQ } : E_valA;
E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : E_valC;
E_icode in { ICALL, IPUSHQ } : -8;
E_icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
IPUSHQ, IRET, IPOPQ, IIADDQ } : E_valB;
E_icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
E_icode == IOPQ : E_ifun;
1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = E_icode in {IOPQ, IIADDQ} &&
# State changes only during normal operation
!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };

## Generate valA in execute stage
word e_valA = E_valA; # Pass valA through stage

## Set dstE to RNONE in event of not-taken conditional move
word e_dstE = [
E_icode == IRRMOVQ && !e_Cnd : RNONE;
1 : E_dstE;
];
访存阶段 Memory Stage

访存阶段的任务是根据给定的内存地址对内存进行读写,其中有mem_read, mem_write, mem_addr, mem_dataStat,共5个控制块。

在上方译码过程中,我们提到了可以直接合并valAvalP两个信号,将他们作为信号valA携带穿过流水线。

总体上来说和原先的SEQ没有太大的区别。

mem_read表示是否只从内存读取数据的控制信号,同理mem_read表示是否只向内存写入数据的控制信号。

mem_addr控制块用来控制选择地址数据的来源(可以选择valE或者valA)。

最后Stat控制块根据取指阶段产生的icode, imem_error, instr_valid以及数据内存产生的dmem_error信号,从指令执行的结果来计算状态码Stat

IIADDQ指令中,并不存在和访存相关的操作,因此不需修改,维持原状:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
################ Memory Stage ######################################

## Select memory address
word mem_addr = [
M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
M_icode in { IPOPQ, IRET } : M_valA;
# Other instructions don't need address
];

## Set read control signal
bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };

#/* $begin pipe-m_stat-hcl */
## Update the status
word m_stat = [
dmem_error : SADR;
1 : M_stat;
];
#/* $end pipe-m_stat-hcl */

## Set E port register ID
word w_dstE = W_dstE;

## Set E port value
word w_valE = W_valE;

## Set M port register ID
word w_dstM = W_dstM;

## Set M port value
word w_valM = W_valM;

## Update processor status
word Stat = [
W_stat == SBUB : SAOK;
1 : W_stat;
];
流水线寄存器控制 Pipeline Register Control

在出现一些特殊控制情况时,例如加载/使用冒险,处理ret指令,预测错误的分支,出现异常等,我们要对流水线进行期望的操作,需要对对应的寄存器进行暂停或者加入气泡的操作。

对应的控制操作如图:

此外,还有可能出现特殊情况组合的情形,共有两种组合情形,其处理方式如下:

很显然在此部分中不涉及IIADDQ的部分,因为其不涉及访存阶段,因而也不存在加载/使用冒险的情况。

根据以上的处理情形,我们可以得到:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
################ Pipeline Register Control #########################

# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
bool F_bubble = 0;
bool F_stall =
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB } ||
# Stalling at fetch while ret passes through pipeline
IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
bool D_stall =
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB };

bool D_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
bool E_stall = 0;
bool E_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB};

# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
bool M_stall = 0;
# Start injecting bubbles as soon as exception passes through memory stage
bool M_bubble = m_stat in { SADR, SINS, SHLT } || W_stat in { SADR, SINS, SHLT };

# Should I stall or inject a bubble into Pipeline Register W?
bool W_stall = W_stat in { SADR, SINS, SHLT };
bool W_bubble = 0;
测试

不管怎么说,我们先对刚才修改完成的hcl文件进行测试,我们使用命令make clean; make VERSION=full

我们在编译时也会遇到类似Part B的错误:

由于 tcl 内部有部分数据结构已经被弃用,编译时可能会爆出错误 psim.c:853:8: error: ‘Tcl_Interp’ {aka ‘struct Tcl_Interp’} has no member named ‘result’,详见 Stack overflow,最简单的处理办法就是在 Makefile 上的 CFLAGS 上添加参数 -DUSE_INTERP_RESULT

此外还有可能遇到编译问题 /usr/bin/ld: /tmp/ccHA7fEj.o:(.data.rel+0x0): undefined reference to matherr,详见 Stack overflow - Fail to build y86-64 simulator from sources。目测可以直接将 psim.c 中与 matherr 相关的代码注释即可,如下:

成功之后,我们对hcl文件进行简单的Y86-64程序测试:

显示程序通过

执行cd ../ptest; make SIM=../pipe/psim TFLAGS=-i进行回归测试:

显示成功通过

程序优化

我们观察给出的ncopy.ys代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:

Loop: mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, goto Npos:
irmovq $1, %r10
addq %r10, %rax # count++
Npos: irmovq $1, %r10
subq %r10, %rdx # len--
irmovq $8, %r10
addq %r10, %rdi # src++
addq %r10, %rsi # dst++
andq %rdx,%rdx # len > 0?
jg Loop # if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

在没有做任何优化的情况下,我们尝试对原始代码进行CPE测试:

1
2
./correctness.pl
./benchmark.pl

得分惨烈。

P.S. 玄学原因,在ncopy.ys文件的最后一行请留出一行空行,不然correctness.pl运行会报错🤔

详情见"can’t find label" error in Y86 compiler

加入iaddq

我们刚好可以使用刚刚实现的iaddq指令,来实现自加自减等功能,而不需要寄存器r10来中间转换。

转换后的代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:

Loop:
mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, goto Npos:
iaddq $1, %rax # count++
Npos:
iaddq $-1, %rdx # len--
iaddq $8, %rdi # src++
iaddq $8, %rsi # dst++
andq %rdx,%rdx # len > 0?
jg Loop # if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

测试 CPE结果:

循环展开&加载使用冒险

通过循环展开,我们可以:

  • 减少索引计算的次数
  • 减少条件分支的判断次数

我们尝试以6路展开为例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
andq %rdx,%rdx # len <= 0?
jmp test
Loop:
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle Loop1
iaddq $1,%rax
Loop1:
mrmovq 8(%rdi),%r10
rmmovq %r10,8(%rsi)
andq %r10,%r10
jle Loop2
iaddq $1,%rax
Loop2:
mrmovq 16(%rdi),%r10
rmmovq %r10,16(%rsi)
andq %r10,%r10
jle Loop3
iaddq $1,%rax
Loop3:
mrmovq 24(%rdi),%r10
rmmovq %r10,24(%rsi)
andq %r10,%r10
jle Loop4
iaddq $1,%rax
Loop4:
mrmovq 32(%rdi),%r10
rmmovq %r10,32(%rsi)
andq %r10,%r10
jle Loop5
iaddq $1,%rax
Loop5:
mrmovq 40(%rdi),%r10
rmmovq %r10,40(%rsi)
iaddq $48,%rdi
iaddq $48,%rsi
andq %r10,%r10
jle test
iaddq $1,%rax
test:
iaddq $-6, %rdx # 判断是否满6
jge Loop # 6路展开
iaddq $-8,%rdi
iaddq $-8,%rsi
iaddq $6, %rdx # 否则恢复原状
jmp test2 #剩下的逐次一个一个处理
Remain:
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle test2
iaddq $1,%rax
test2:
iaddq $8,%rdi
iaddq $8,%rsi
iaddq $-1, %rdx
jge Remain
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

在这样的操作下,我们通过CPE测试可以拿到18.3分:

可以看到对长度较小的数组,CPE数值还是很大,因为没有针对其进行循环展开,我们继续尝试对长度小于6的部分进行3路循环展开:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
andq %rdx,%rdx # len <= 0?
jmp test
Loop:
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle Loop1
iaddq $1,%rax
Loop1:
mrmovq 8(%rdi),%r10
rmmovq %r10,8(%rsi)
andq %r10,%r10
jle Loop2
iaddq $1,%rax
Loop2:
mrmovq 16(%rdi),%r10
rmmovq %r10,16(%rsi)
andq %r10,%r10
jle Loop3
iaddq $1,%rax
Loop3:
mrmovq 24(%rdi),%r10
rmmovq %r10,24(%rsi)
andq %r10,%r10
jle Loop4
iaddq $1,%rax
Loop4:
mrmovq 32(%rdi),%r10
rmmovq %r10,32(%rsi)
andq %r10,%r10
jle Loop5
iaddq $1,%rax
Loop5:
mrmovq 40(%rdi),%r10
rmmovq %r10,40(%rsi)
iaddq $48,%rdi
iaddq $48,%rsi
andq %r10,%r10
jle test
iaddq $1,%rax
test:
iaddq $-6, %rdx
jge Loop
iaddq $6, %rdx
jmp test2
L:
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle L1
iaddq $1,%rax
L1:
mrmovq 8(%rdi),%r10
rmmovq %r10,8(%rsi)
andq %r10,%r10
jle L2
iaddq $1,%rax
L2:
mrmovq 16(%rdi),%r10
rmmovq %r10,16(%rsi)
iaddq $24,%rdi
iaddq $24,%rsi
andq %r10,%r10
jle test2
iaddq $1,%rax
test2:
iaddq $-3, %rdx # 先减,判断够不够3个
jge L
iaddq $2, %rdx # -1则不剩了,直接Done,0 剩一个, 1剩2个
je R0
jl Done
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle R2
iaddq $1,%rax
R2:
mrmovq 8(%rdi),%r10
rmmovq %r10,8(%rsi)
andq %r10,%r10
jle Done
iaddq $1,%rax
jmp Done
R0:
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle Done
iaddq $1,%rax
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

通过对剩余的长度小于6的数组进行3路循环展开优化,我们进一步提升了CPE测试结果:

消除气泡

在刚才的程序中,根据注释提示不难看出,存在加载/使用冒险:

1
2
mrmovq (%rdi), %r10	# read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst

流水线为了正常运作,由于访存阶段处于较为靠后的阶段,下方指令需要等待上方指令运行到访存阶段才能正确使用寄存器值,因此存在一个气泡的浪费。也就是:

1
2
3
mrmovq (%rdi), %r10	# read val from src...
# bubble...
rmmovq %r10, (%rsi) # ...and store it to dst

我们可以考虑将这两个操作分开?中间插入一些其他没有依赖先后关系的后续操作,例如直接操作下一个位置的val,由于r10寄存器还有用,我们暂用r11存储。这样既消除了气泡,同时又没有浪费性能,只是将下一步的操作提前了:

1
2
3
4
mrmovq (%rdi), %r10
mrmovq 8(%rdi), %r11
rmmovq %r10, (%rsi)
rmmovq %r11, 8(%rsi)

最终的结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
andq %rdx,%rdx # len <= 0?
jmp test
Loop:
mrmovq (%rdi),%r10
mrmovq 8(%rdi),%r11
rmmovq %r10,(%rsi)
rmmovq %r11,8(%rsi)
andq %r10,%r10
jle Loop1
iaddq $1,%rax
Loop1:
andq %r11,%r11
jle Loop2
iaddq $1,%rax
Loop2:
mrmovq 16(%rdi),%r10
mrmovq 24(%rdi),%r11
rmmovq %r10,16(%rsi)
rmmovq %r11,24(%rsi)
andq %r10,%r10
jle Loop3
iaddq $1,%rax
Loop3:
andq %r11,%r11
jle Loop4
iaddq $1,%rax
Loop4:
mrmovq 32(%rdi),%r10
mrmovq 40(%rdi),%r11
rmmovq %r10,32(%rsi)
rmmovq %r11,40(%rsi)
andq %r10,%r10
jle Loop5
iaddq $1,%rax
Loop5:
iaddq $48,%rdi
iaddq $48,%rsi
andq %r11,%r11
jle test
iaddq $1,%rax
test:
iaddq $-6, %rdx
jge Loop
iaddq $6, %rdx
jmp test2
L:
mrmovq (%rdi),%r10
mrmovq 8(%rdi),%r11
rmmovq %r10,(%rsi)
rmmovq %r11,8(%rsi)
andq %r10,%r10
jle L1
iaddq $1,%rax
L1:
andq %r11,%r11
jle L2
iaddq $1,%rax
L2:
mrmovq 16(%rdi),%r10
iaddq $24,%rdi
rmmovq %r10,16(%rsi)
iaddq $24,%rsi
andq %r10,%r10
jle test2
iaddq $1,%rax
test2:
iaddq $-3, %rdx # 先减,判断够不够3个
jge L
iaddq $2, %rdx # -1则不剩了,直接Done,0 剩一个, 1剩2个
je R0
jl Done
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle R2
iaddq $1,%rax
R2:
mrmovq 8(%rdi),%r10
rmmovq %r10,8(%rsi)
andq %r10,%r10
jle Done
iaddq $1,%rax
jmp Done
R0:
mrmovq (%rdi),%r10
rmmovq %r10,(%rsi)
andq %r10,%r10
jle Done
iaddq $1,%rax
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

最后大概有三处加载/使用冒险,额,还是没想好怎么解决🤔

总而言之,最后优化成了如下,CPE测试47分:

题目在PDF中要求的CPE低于9.009.00倒是达成了,更好奇CMU老教授是怎么做到7.507.50以下的了。

总结

花了点零零碎碎的时间,通读了CS:APP的第四章,书里面循序渐进地从SEQ硬件,到PIPE-硬件,最后得到完整的PIPE硬件结构。虽然说书看起来有点头痛,但是耐下心去看其实还不错,成就感还是挺强的。