CS:APP Architecture Lab实验

本文是CS:APP学习笔记的一部分：

相关的代码都放在了GitHub下了：RayZhang13/CSAPP-Labs

关于学习资源和视频等问题可以参考第一次写的Data Lab开头中提及的相关问题。

前段时间在整编译原理大作业，不得不说太硬核了，龙书也是非常精彩… 然后我的CS:APP计划又没时间做了。

这次的实验是Architecture Lab，在本次试验中，我们将了解Y86-64处理器的设计和实现，通过修改处理器的HCL描述来增加新的指令、修改循环策略等，使得修改后的处理器可以成功模拟相应的功能，并最终通过自动化测试。

准备

对应课程

很遗憾的是这次的Architecture Lab作业，貌似在上述介绍到的B站课程中，并没有覆盖到第4章（处理器体系结构）的内容，而是直接跳转到了第5章（优化程序性能），对应B站网课P10。

P.S. 别看我，B站没有第4章，CMU自己放出的原版视频也没有第4章 :(

我的建议是由于本次试验涉及Y86-64相关汇编语言的操作，对Y86进行HCL描述，以及流水线和程序优化等知识。直接去看第5章网课可能由于对流水线等知识点不熟悉而产生困惑，建议还是先把书本第4章自己看完。

课程文件

相关的作业可以在CMU的官网上找到：

Lab Assignments

在Architecture Lab一栏里面，我们可以查看相关文件，例如：

README文档
Writeup PDF说明材料
自学材料最重要的作业文件Self-Study Handout

下载后并解压的文件如图所示：

使用环境

运行环境，保险起见还是使用Linux，如下为我目前使用的Linux版本和GCC对应版本：

前期准备

我们首先对sim.tar解压，进入到sim目录下，此目录下存储着Y86-64的相关工具。所有的相关工作都在此文件夹下开展。

在此文件夹下我们执行make clean; make，完成Y86-64工具的编译。

P.S. 由于我们的GCC版本太高，中途可能会遇到许多奇奇怪怪的问题，毕竟这个Lab从16年之后就再没更新过了。

如果遇到multiple definition of 'lineno'; yas-grammar.o:(.bss+0x0): first defined here 这样的错误，我们需要分别进入misc和pipe目录下，修改内部的Makefile，添加参数-fcommon。相关解决方案在Stack overflow上：Fail to compile the Y86 simulatur (CSAPP)

如果遇到/usr/bin/ld: cannot find -ltk: No such file or directory相关报错，可能是缺少组件。在Arch Linux下，通过pacman -S tk即可解决。

每次修改完之后，我们退回到sim目录下，重新执行make clean; make。

成功编译结果如下：

题目

Part A

在本部分中，我们将尝试书写一些Y86-64程序，并熟悉Y86-64相关工具。

本部分我们在文件夹sim/misc下进行，我们的目标是根据examples.c中的代码完成相应的Y86-64程序的书写。

examples.c内容如下所示：

/*
 * Architecture Lab: Part A
 *
 * High level specs for the functions that the students will rewrite
 * in Y86-64 assembly language
 */

/* $begin examples */
/* linked list element */
typedef struct ELE
{
    long val;
    struct ELE *next;
} * list_ptr;

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
    long val = 0;
    while (ls)
    {
        val += ls->val;
        ls = ls->next;
    }
    return val;
}

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
    if (!ls)
        return 0;
    else
    {
        long val = ls->val;
        long rest = rsum_list(ls->next);
        return val + rest;
    }
}

/* copy_block - Copy src to dest and return xor checksum of src */
long copy_block(long *src, long *dest, long len)
{
    long result = 0;
    while (len > 0)
    {
        long val = *src++;
        *dest++ = val;
        result ^= val;
        len--;
    }
    return result;
}
/* $end examples */

在创建了对应的ys程序后，相应的我们使用YAS汇编器输出对应的机器代码，并使用YIS指令模拟器模拟Y86-64机器代码程序的执行。

sum.ys 迭代地对链表元素进行相加

本阶段，我们需要根据examples.c中的sum_list()方法，使用Y86-64完成对应的功能。

我们对照着看一下sum_list要实现的功能：

typedef struct ELE
{
    long val;
    struct ELE *next;
} * list_ptr;

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
    long val = 0;
    while (ls)
    {
        val += ls->val;
        ls = ls->next;
    }
    return val;
}

此外在PDF中，还给出了示例链表，希望我们将该链表代入作为测试：

# Sample linked list
.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

代码根据第4章 4.1.5提供的相关代码，大致写作如下：

# Excution begins at address 0
	.pos 0
	irmovq stack, %rsp	# Set up stack pointer
	call main	# Excute main program
	halt	# Terminate program

# Sample linked list
.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

# main function
main:
	irmovq ele1, %rdi
	call sum_list	# sum_list(ele1)
	ret

# sum_list function
sum_list:
	irmovq $0, %rax	# val = 0
loop:
	andq %rdi, %rdi # 判断rdi地址是否有效
  je out
  mrmovq 0(%rdi), %rsi # rsi = *(rdi)
  addq %rsi, %rax # rax += rsi
  mrmovq 8(%rdi), %rdi #  rdi = *(rdi + 8)
  jmp loop
out:
	ret
	
# Stack starts here and grows to lower addresses
	.pos 0x200
stack:

完成了对应程序的编写，我们带入到Y86-64模拟器中尝试运行：

有兴趣的话还可以看一下生成的sum.yo的内容，左侧对应了相关的字节码：

                            | # Excution begins at address 0
0x000:                      | 	.pos 0
0x000: 30f40002000000000000 | 	irmovq stack, %rsp	# Set up stack pointer
0x00a: 804800000000000000   | 	call main	# Excute main program
0x013: 00                   | 	halt	# Terminate program
                            | 
                            | # Sample linked list
0x018:                      | .align 8
0x018:                      | ele1:
0x018: 0a00000000000000     | 	.quad 0x00a
0x020: 2800000000000000     | 	.quad ele2
0x028:                      | ele2:
0x028: b000000000000000     | 	.quad 0x0b0
0x030: 3800000000000000     | 	.quad ele3
0x038:                      | ele3:
0x038: 000c000000000000     | 	.quad 0xc00
0x040: 0000000000000000     | 	.quad 0
                            | 
                            | # main function
0x048:                      | main:
0x048: 30f71800000000000000 | 	irmovq ele1, %rdi
0x052: 805c00000000000000   | 	call sum_list	# sum_list(ele1)
0x05b: 90                   | 	ret
                            | 
                            | # sum_list function
0x05c:                      | sum_list:
0x05c: 30f00000000000000000 | 	irmovq $0, %rax	# val = 0
0x066:                      | loop:
0x066: 6277                 | 	andq %rdi, %rdi # 判断rdi地址是否有效
0x068: 739000000000000000   |   je out
0x071: 50670000000000000000 |   mrmovq 0(%rdi), %rsi # rsi = *(rdi)
0x07b: 6060                 |   addq %rsi, %rax # rax += rsi
0x07d: 50770800000000000000 |   mrmovq 8(%rdi), %rdi #  rdi = *(rdi + 8)
0x087: 706600000000000000   |   jmp loop
0x090:                      | out:
0x090: 90                   | 	ret
                            | 	
                            | # Stack starts here and grows to lower addresses
0x200:                      | 	.pos 0x200
0x200:                      | stack:

最终rax输出结果为0xcba，确实为0xa + 0xb0 + 0xc00的计算结果，验证成功。

此外我们还能发现由于使用call指令，对栈空间发生的改变。

rsum.ys 递归地对链表元素进行相加

我们首先看一下要求的功能：

typedef struct ELE
{
    long val;
    struct ELE *next;
} * list_ptr;

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
    if (!ls)
        return 0;
    else
    {
        long val = ls->val;
        long rest = rsum_list(ls->next);
        return val + rest;
    }
}

测试用的数据结构同上sum_list，因此与上方类似的，我们尝试写下如下的代码：

# Excution begins at address 0
	.pos 0
	irmovq stack, %rsp	# Set up stack pointer
	call main	# Excute main program
	halt	# Terminate program

# Sample linked list
.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

# main function
main:
	irmovq ele1, %rdi
	call rsum_list	# rsum_list(ele1)
	ret

# rsum_list function
rsum_list:
	pushq %rbx
	irmovq $0, %rax 
	andq %rdi, %rdi
	jne L1	# 如果节点为nil，直接返回0
	irmovq $0, %rax
	jmp L2
L1:
	mrmovq (%rdi), %rbx
	mrmovq 8(%rdi), %rdi	# rdi = rdi -> next
	call rsum_list
	addq %rbx, %rax	# node.val + rsum_list(node.next)
L2:
	popq	%rbx
	ret
	
# Stack starts here and grows to lower addresses
	.pos 0x200
stack:

注意到我们使用的rbx寄存器用作数据存储，遵循被调用者使用规则，简单说就是随便用，调用子函数之前要备份它，以防他被修改。而对于一个call指令前后，rbx寄存器的值应当是一致的，中间存在入栈保存现场和出栈恢复现场的过程。

代入模拟器中尝试运行：

此外rsum.yo的内容如下，仅作参考：

                            | # Excution begins at address 0
0x000:                      | 	.pos 0
0x000: 30f40002000000000000 | 	irmovq stack, %rsp	# Set up stack pointer
0x00a: 804800000000000000   | 	call main	# Excute main program
0x013: 00                   | 	halt	# Terminate program
                            | 
                            | # Sample linked list
0x018:                      | .align 8
0x018:                      | ele1:
0x018: 0a00000000000000     | 	.quad 0x00a
0x020: 2800000000000000     | 	.quad ele2
0x028:                      | ele2:
0x028: b000000000000000     | 	.quad 0x0b0
0x030: 3800000000000000     | 	.quad ele3
0x038:                      | ele3:
0x038: 000c000000000000     | 	.quad 0xc00
0x040: 0000000000000000     | 	.quad 0
                            | 
                            | # main function
0x048:                      | main:
0x048: 30f71800000000000000 | 	irmovq ele1, %rdi
0x052: 805c00000000000000   | 	call rsum_list	# rsum_list(ele1)
0x05b: 90                   | 	ret
                            | 
                            | # rsum_list function
0x05c:                      | rsum_list:
0x05c: a03f                 | 	pushq %rbx
0x05e: 30f00000000000000000 | 	irmovq $0, %rax 
0x068: 6277                 | 	andq %rdi, %rdi
0x06a: 748600000000000000   | 	jne L1	# 如果节点为nil，直接返回0
0x073: 30f00000000000000000 | 	irmovq $0, %rax
0x07d: 70a500000000000000   | 	jmp L2
0x086:                      | L1:
0x086: 50370000000000000000 | 	mrmovq (%rdi), %rbx
0x090: 50770800000000000000 | 	mrmovq 8(%rdi), %rdi	# rdi = rdi -> next
0x09a: 805c00000000000000   | 	call rsum_list
0x0a3: 6030                 | 	addq %rbx, %rax	# node.val + rsum_list(node.next)
0x0a5:                      | L2:
0x0a5: b03f                 | 	popq	%rbx
0x0a7: 90                   | 	ret
                            | 	
                            | # Stack starts here and grows to lower addresses
0x200:                      | 	.pos 0x200
0x200:                      | stack:

结果同上，也是正确的。此外还可以观察到由于我们这次是使用递归调用的，因此使用了更多的栈空间。

copy.ys 复制数据块

我们先看C的实现逻辑：

/* copy_block - Copy src to dest and return xor checksum of src */
long copy_block(long *src, long *dest, long len)
{
    long result = 0;
    while (len > 0)
    {
        long val = *src++;
        *dest++ = val;
        result ^= val;
        len--;
    }
    return result;
}

不难看出，copy_block传入的前两个参数为两个long数组，分别表示了一一对应的源地址和目标地址，希望我们将从数据块从src复制到dest。

PDF中提供了如下的测试数据：

.align 8
# Source block
src:
	.quad 0x00a
	.quad 0x0b0
	.quad 0xc00
# Destination block
dest:
	.quad 0x111
	.quad 0x222
	.quad 0x333

我们根据代码写出如下代码：

# Excution begins at address 0
	.pos 0
	irmovq stack, %rsp	# Set up stack pointer
	call main	# Excute main program
	halt	# Terminate program

.align 8
# Source block
src:
	.quad 0x00a
	.quad 0x0b0
	.quad 0xc00
# Destination block
dest:
	.quad 0x111
	.quad 0x222
	.quad 0x333

# main function
main:
	irmovq src, %rdi
	irmovq dest, %rsi
	irmovq $3, %rdx
	call copy_block # copy_block(src, dest, 3)
	ret

# copy_block function
copy_block:
	pushq %r8
	pushq %r9
	pushq %rcx
	irmovq $8, %r8
	irmovq $1, %r9
	irmovq $0, %rax
L1:
	andq %rdx, %rdx
	je L2
	mrmovq 0(%rdi), %rcx
	addq %r8, %rdi
	rmmovq %rcx, (%rsi)
	addq %r8, %rsi
	xorq %rcx, %rax
	subq %r9, %rdx
	jmp L1
L2:
	popq %r8
	popq %r9
	popq %rcx
	ret
	
# Stack starts here and grows to lower addresses
	.pos 0x200
stack:

结果如下：

经过验证0xa ^ 0xb0 ^ 0xc00结果确实为0xcba，结果成立。

copy.yo内容如下，仅供参考：

                            | # Excution begins at address 0
0x000:                      | 	.pos 0	
0x000: 30f40002000000000000 | 	irmovq stack, %rsp	# Set up stack pointer
0x00a: 804800000000000000   | 	call main	# Excute main program
0x013: 00                   | 	halt	# Terminate program
                            | 
0x018:                      | .align 8
                            | # Source block
0x018:                      | src:
0x018: 0a00000000000000     | 	.quad 0x00a
0x020: b000000000000000     | 	.quad 0x0b0
0x028: 000c000000000000     | 	.quad 0xc00
                            | # Destination block
0x030:                      | dest:
0x030: 1101000000000000     | 	.quad 0x111
0x038: 2202000000000000     | 	.quad 0x222
0x040: 3303000000000000     | 	.quad 0x333
                            | 
                            | # main function
0x048:                      | main:
0x048: 30f71800000000000000 | 	irmovq src, %rdi
0x052: 30f63000000000000000 | 	irmovq dest, %rsi
0x05c: 30f20300000000000000 | 	irmovq $3, %rdx
0x066: 807000000000000000   | 	call copy_block # copy_block(src, dest, 3)
0x06f: 90                   | 	ret
                            | 
                            | # copy_block function
0x070:                      | copy_block:
0x070: a08f                 | 	pushq %r8
0x072: a09f                 | 	pushq %r9
0x074: a01f                 | 	pushq %rcx
0x076: 30f80800000000000000 | 	irmovq $8, %r8
0x080: 30f90100000000000000 | 	irmovq $1, %r9
0x08a: 30f00000000000000000 | 	irmovq $0, %rax
0x094:                      | L1:
0x094: 6222                 | 	andq %rdx, %rdx
0x096: 73c400000000000000   | 	je L2
0x09f: 50170000000000000000 | 	mrmovq 0(%rdi), %rcx
0x0a9: 6087                 | 	addq %r8, %rdi
0x0ab: 40160000000000000000 | 	rmmovq %rcx, (%rsi)
0x0b5: 6086                 | 	addq %r8, %rsi
0x0b7: 6310                 | 	xorq %rcx, %rax
0x0b9: 6192                 | 	subq %r9, %rdx
0x0bb: 709400000000000000   | 	jmp L1
0x0c4:                      | L2:
0x0c4: b08f                 | 	popq %r8
0x0c6: b09f                 | 	popq %r9
0x0c8: b01f                 | 	popq %rcx
0x0ca: 90                   | 	ret
                            | 	
                            | # Stack starts here and grows to lower addresses
0x200:                      | 	.pos 0x200
0x200:                      | stack:

Part B

在本部分，我们在文件夹sim/seq下来进行。

该部分内容主要是为SEQ处理器添加指令IADDQ，所要修改的文件为seq-full.hcl。

由于iaddq指令既与运算操作相关，又与立即数处理相关，故该指令的功能添加可以参考seq-full.hcl中的IOPQ以及IIRMOVQ来编写。

iaddq指令相关要求，详见CS:APP第三版中文版P328家庭作业4.51以及4.52，P253 练习题4.3。

sql-full.hcl目前内容如下：

#/* $begin seq-all-hcl */
####################################################################
#  HCL Description of Control for Single Cycle Y86-64 Processor SEQ   #
#  Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2010       #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work

####################################################################
#    C Include's.  Don't alter these                               #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'word_t gen_pc(){return 0;}'
quote 'int main(int argc, char *argv[])'
quote '  {plusmode=0;return sim_main(argc,argv);}'

####################################################################
#    Declarations.  Do not change/remove/delete any of these       #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 	'I_NOP'
wordsig IHALT	'I_HALT'
wordsig IRRMOVQ	'I_RRMOVQ'
wordsig IIRMOVQ	'I_IRMOVQ'
wordsig IRMMOVQ	'I_RMMOVQ'
wordsig IMRMOVQ	'I_MRMOVQ'
wordsig IOPQ	'I_ALU'
wordsig IJXX	'I_JMP'
wordsig ICALL	'I_CALL'
wordsig IRET	'I_RET'
wordsig IPUSHQ	'I_PUSHQ'
wordsig IPOPQ	'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ	'I_IADDQ'

##### Symbolic represenations of Y86-64 function codes                  #####
wordsig FNONE    'F_NONE'        # Default function code

##### Symbolic representation of Y86-64 Registers referenced explicitly #####
wordsig RRSP     'REG_RSP'    	# Stack Pointer
wordsig RNONE    'REG_NONE'   	# Special value indicating "no register"

##### ALU Functions referenced explicitly                            #####
wordsig ALUADD	'A_ADD'		# ALU should add its arguments

##### Possible instruction status values                             #####
wordsig SAOK	'STAT_AOK'	# Normal execution
wordsig SADR	'STAT_ADR'	# Invalid memory address
wordsig SINS	'STAT_INS'	# Invalid instruction
wordsig SHLT	'STAT_HLT'	# Halt instruction encountered

##### Signals that can be referenced by control logic ####################

##### Fetch stage inputs		#####
wordsig pc 'pc'				# Program counter
##### Fetch stage computations		#####
wordsig imem_icode 'imem_icode'		# icode field from instruction memory
wordsig imem_ifun  'imem_ifun' 		# ifun field from instruction memory
wordsig icode	  'icode'		# Instruction control code
wordsig ifun	  'ifun'		# Instruction function
wordsig rA	  'ra'			# rA field from instruction
wordsig rB	  'rb'			# rB field from instruction
wordsig valC	  'valc'		# Constant from instruction
wordsig valP	  'valp'		# Address of following instruction
boolsig imem_error 'imem_error'		# Error signal from instruction memory
boolsig instr_valid 'instr_valid'	# Is fetched instruction valid?

##### Decode stage computations		#####
wordsig valA	'vala'			# Value from register A port
wordsig valB	'valb'			# Value from register B port

##### Execute stage computations	#####
wordsig valE	'vale'			# Value computed by ALU
boolsig Cnd	'cond'			# Branch test

##### Memory stage computations		#####
wordsig valM	'valm'			# Value read from memory
boolsig dmem_error 'dmem_error'		# Error signal from data memory


####################################################################
#    Control Signal Definitions.                                   #
####################################################################

################ Fetch Stage     ###################################

# Determine instruction code
word icode = [
	imem_error: INOP;
	1: imem_icode;		# Default: get from instruction memory
];

# Determine instruction function
word ifun = [
	imem_error: FNONE;
	1: imem_ifun;		# Default: get from instruction memory
];

bool instr_valid = icode in 
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	       IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };

# Does fetched instruction require a regid byte?
bool need_regids =
	icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ, 
		     IIRMOVQ, IRMMOVQ, IMRMOVQ };

# Does fetched instruction require a constant word?
bool need_valC =
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };

################ Decode Stage    ###################################

## What register should be used as the A source?
word srcA = [
	icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : rA;
	icode in { IPOPQ, IRET } : RRSP;
	1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word srcB = [
	icode in { IOPQ, IRMMOVQ, IMRMOVQ  } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
word dstE = [
	icode in { IRRMOVQ } && Cnd : rB;
	icode in { IIRMOVQ, IOPQ} : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

## What register should be used as the M destination?
word dstM = [
	icode in { IMRMOVQ, IPOPQ } : rA;
	1 : RNONE;  # Don't write any register
];

################ Execute Stage   ###################################

## Select input A to ALU
word aluA = [
	icode in { IRRMOVQ, IOPQ } : valA;
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : valC;
	icode in { ICALL, IPUSHQ } : -8;
	icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
	icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL, 
		      IPUSHQ, IRET, IPOPQ } : valB;
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
	icode == IOPQ : ifun;
	1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = icode in { IOPQ };

################ Memory Stage    ###################################

## Set read control signal
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };

## Select memory address
word mem_addr = [
	icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
	icode in { IPOPQ, IRET } : valA;
	# Other instructions don't need address
];

## Select memory input data
word mem_data = [
	# Value from register
	icode in { IRMMOVQ, IPUSHQ } : valA;
	# Return PC
	icode == ICALL : valP;
	# Default: Don't write anything
];

## Determine instruction status
word Stat = [
	imem_error || dmem_error : SADR;
	!instr_valid: SINS;
	icode == IHALT : SHLT;
	1 : SAOK;
];

################ Program Counter Update ############################

## What address should instruction be fetched at

word new_pc = [
	# Call.  Use instruction constant
	icode == ICALL : valC;
	# Taken branch.  Use instruction constant
	icode == IJXX && Cnd : valC;
	# Completion of RET instruction.  Use value from stack
	icode == IRET : valM;
	# Default: Use incremented PC
	1 : valP;
];
#/* $end seq-all-hcl */

在头上的注释中已经表明，在程序内已经声明了iaddq(IIADDQ)的相关icode，我们需要做的是补全相关的逻辑，使其工作。

首先，新指令iaddq格式如下，仅作参考：

其实现的功能，简单来说也就是将常数V直接加到寄存器rB中。

首先，我们不妨模仿书上的例子，将iaddq指令对应的6个阶段梳理一下：

取指

$icode:ifun \gets M_1[PC]$

$rA:rB \gets M_1[PC+1]$

$valC \gets M_8[PC+2]$

$valP \gets PC + 10$
译码

$valB \gets R[rB]$
执行

$valE \gets valB + valC$
访存

无操作
写回

$R[rB] \gets valE$
更新PC

$PC \gets valP$

取指阶段 Fetch Stage

取指阶段中，有icode, ifun, instr_valid, need_regids和need_valC共5个控制逻辑块。

icode和ifun控制块不用多说，是用来从指令内存中读取逻辑块计算指令和功能码的，不必我们进行修改。

instr_valid表示这个字节是否对应于一个合法的Y86-64指令，我们通过这个信号发现不合法的指令。因此，我们需要将IIADDQ加入到其中去，表示一个合法的指令。

need_regids表示指令中是否包括一个寄存器指示符字节，很显然，根据刚刚给出的iaddq指令格式，我们显然看到了寄存器指示符字节，由0xF和rB组成。因此我们需要IIADDQ加入其中。

need_valC表示指令是否包含一个常数。同上，我们观察到，iaddq指令在其2~9字节存储了一个常数，也就是我们所说的常数V，即valC。因此，需要加入IIADDQ指令。

综上，我们取指部分最终的结果为：

################ Fetch Stage     ###################################

# Determine instruction code
word icode = [
	imem_error: INOP;
	1: imem_icode;		# Default: get from instruction memory
];

# Determine instruction function
word ifun = [
	imem_error: FNONE;
	1: imem_ifun;		# Default: get from instruction memory
];

bool instr_valid = icode in 
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	       IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };

# Does fetched instruction require a regid byte?
bool need_regids =
	icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ, 
		     IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };

# Does fetched instruction require a constant word?
bool need_valC =
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

译码和写回阶段 Decode&Writeback Stage

在译码和写回阶段中，共有srcA, srcB, dstE和dstM，共4个控制逻辑块。在这里，我们处理的是寄存器的读写逻辑。

其中srcA和srcB用于控制选择两个读端口的地址输入，dstE和dstM控制选择写端口的地址输入。

很明显的，在iaddq指令中，我们需要对寄存器rB进行读写操作。因此，我们需要对指令IIADDQ指定读取访问寄存器为rB，写入目标寄存器为rB。此外，还注意到IIADDQ和IRRMOVQ指令不同，并不需要条件Cnd信号控制。

因此，我们最终此部分的逻辑为：

################ Decode Stage    ###################################

## What register should be used as the A source?
word srcA = [
	icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : rA;
	icode in { IPOPQ, IRET } : RRSP;
	1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word srcB = [
	icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ  } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
word dstE = [
	icode in { IRRMOVQ } && Cnd : rB;
	icode in { IIRMOVQ, IOPQ, IIADDQ} : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

## What register should be used as the M destination?
word dstM = [
	icode in { IMRMOVQ, IPOPQ } : rA;
	1 : RNONE;  # Don't write any register
];

执行阶段 Excuse Stage

在执行阶段中，涉及到aluA, aluB, alufun和set_cc，共4个控制逻辑块。

aluA和aluB控制块用于控制选择两个进入ALU算术/逻辑单元的数据。其中列出的操作数aluB在前，aluA在后，为了保证例如subq指令中是valB - valA，而不是相反操作。我们发现对于IIADDQ指令，需要进行的操作是valB + valC，因此我们将aluB指向valB，将aluA指向常数valC。

ALU在执行阶段进行的操作，通常作为加法器来进行使用，但是对于OPq指令（例如与或非运算），有时我们希望使用指令ifun字段中编码的操作，因此我们引入alufun控制块。对于IIADDQ指令，并不需要我们进行修改，默认加法即可。

每次计算时，还有可能根据计算的结果调整条件码寄存器CC的内容（例如之前提及的CF, ZF, SF等寄存器），因此我们引入set_cc控制块来规定是否更新条件码寄存器。对于IIADDQ指令，应当更新。

因此，修改后的结果为：

################ Execute Stage   ###################################

## Select input A to ALU
word aluA = [
	icode in { IRRMOVQ, IOPQ } : valA;
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC;
	icode in { ICALL, IPUSHQ } : -8;
	icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
	icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL, 
		      IPUSHQ, IRET, IPOPQ, IIADDQ } : valB;
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
	icode == IOPQ : ifun;
	1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = icode in { IOPQ, IIADDQ };

访存阶段 Memory Stage

访存阶段的任务是根据给定的内存地址对内存进行读写，其中有mem_read, mem_write, mem_addr, mem_data和Stat，共5个控制块。

mem_read表示是否只从内存读取数据的控制信号，同理mem_read表示是否只向内存写入数据的控制信号。

mem_addr控制块用来控制选择地址数据的来源（可以选择valE或者valA）。

最后Stat控制块根据取指阶段产生的icode, imem_error, instr_valid以及数据内存产生的dmem_error信号，从指令执行的结果来计算状态码Stat。

在IIADDQ指令中，并不存在和访存相关的操作，因此不需修改，维持原状：

################ Memory Stage    ###################################

## Set read control signal
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };

## Select memory address
word mem_addr = [
	icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
	icode in { IPOPQ, IRET } : valA;
	# Other instructions don't need address
];

## Select memory input data
word mem_data = [
	# Value from register
	icode in { IRMMOVQ, IPUSHQ } : valA;
	# Return PC
	icode == ICALL : valP;
	# Default: Don't write anything
];

## Determine instruction status
word Stat = [
	imem_error || dmem_error : SADR;
	!instr_valid: SINS;
	icode == IHALT : SHLT;
	1 : SAOK;
];

更新PC阶段 Program Counter Update

在本阶段中，我们完成对程序计数器的更新。使用了一个new_pc的控制逻辑块。

程序计数器指向下一步执行的程序地址，正常情况下如果是按照顺序顺沿，应该就是增加过后的valP。但是有时可能通过jXX等跳转指令实现跳转到其他位置，地址的来源可能是内存中读取的valM，也可能是jXX命令中带有的常数valC。

new_pc的功能就是根据需要，通过分析icode（指令的类型）和Cnd的值（jXX指令的跳转条件），对valC, valM, valP进行选择控制。

在这里，我们只需要保持原状，让程序计数器顺沿更新为valP即可。

################ Program Counter Update ############################

## What address should instruction be fetched at

word new_pc = [
	# Call.  Use instruction constant
	icode == ICALL : valC;
	# Taken branch.  Use instruction constant
	icode == IJXX && Cnd : valC;
	# Completion of RET instruction.  Use value from stack
	icode == IRET : valM;
	# Default: Use incremented PC
	1 : valP;
];

测试

我们在sim/seq目录下，进行编译。使用命令make clean; make ssim VERSION=full。

由于tcl内部有部分数据结构已经被弃用，编译时可能会爆出错误psim.c:853:8: error: ‘Tcl_Interp’ {aka ‘struct Tcl_Interp’} has no member named ‘result’，详见Stack overflow，最简单的处理办法就是在Makefile上的CFLAGS上添加参数-DUSE_INTERP_RESULT。

此外还有可能遇到编译问题/usr/bin/ld: /tmp/ccHA7fEj.o:(.data.rel+0x0): undefined reference to matherr，详见Stack overflow - Fail to build y86-64 simulator from sources。目测可以直接将ssim.c中与matherr相关的代码注释即可，如下：

接下来，我们就成功的编译了模拟器。

我们尝试对刚才的hcl文件进行简单的Y86-64程序测试：

显示通过测试。

接下来我们进行更加全面的基准测试：

显示测试全数通过。

接下来，我们进行回归测试：

通过测试。

Part C

在本部分，我们在文件夹sim/pipe下来进行。

题目中，给定了ncopy函数的C代码：

/*
 * ncopy - copy src to dst, returning number of positive ints
 * contained in src array.
 */
word_t ncopy(word_t *src, word_t *dst, word_t len) {
  word_t count = 0;
  word_t val;
  
  while(len > 0) {
    val = *src++;
    *dst++ = val;
    if (val > 0)
      count++;
    len--;
  }
  return count;
}

功能为，将 src 数组复制到 dest 数组，并返回数组中的正数的总数。

此外，题目还提供了ncopy的Y86-64代码ncopy.ys：

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Loop header
	xorq %rax,%rax		# count = 0;
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:

Loop:	mrmovq (%rdi), %r10	# read val from src...
	rmmovq %r10, (%rsi)	# ...and store it to dst
	andq %r10, %r10		# val <= 0?
	jle Npos		# if so, goto Npos:
	irmovq $1, %r10
	addq %r10, %rax		# count++
Npos:	irmovq $1, %r10
	subq %r10, %rdx		# len--
	irmovq $8, %r10
	addq %r10, %rdi		# src++
	addq %r10, %rsi		# dst++
	andq %rdx,%rdx		# len > 0?
	jg Loop			# if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

同Part B中相关内容，题目首先要求我们在pipe-full.hcl文件中实现iaddq指令。

然后通过修改的方式对ncopy.ys汇编文件和进行pipe-full.hcl效率优化。

实现IIADDQ

修改前的pipe-full.hcl如下所示，其中在文件中只是简单的对IIADDQ进行了声明。

#/* $begin pipe-all-hcl */
####################################################################
#    HCL Description of Control for Pipelined Y86-64 Processor     #
#    Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2014     #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work

####################################################################
#    C Include's.  Don't alter these                               #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "pipeline.h"'
quote '#include "stages.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'int main(int argc, char *argv[]){return sim_main(argc,argv);}'

####################################################################
#    Declarations.  Do not change/remove/delete any of these       #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 	'I_NOP'
wordsig IHALT	'I_HALT'
wordsig IRRMOVQ	'I_RRMOVQ'
wordsig IIRMOVQ	'I_IRMOVQ'
wordsig IRMMOVQ	'I_RMMOVQ'
wordsig IMRMOVQ	'I_MRMOVQ'
wordsig IOPQ	'I_ALU'
wordsig IJXX	'I_JMP'
wordsig ICALL	'I_CALL'
wordsig IRET	'I_RET'
wordsig IPUSHQ	'I_PUSHQ'
wordsig IPOPQ	'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ	'I_IADDQ'

##### Symbolic represenations of Y86-64 function codes            #####
wordsig FNONE    'F_NONE'        # Default function code

##### Symbolic representation of Y86-64 Registers referenced      #####
wordsig RRSP     'REG_RSP'    	     # Stack Pointer
wordsig RNONE    'REG_NONE'   	     # Special value indicating "no register"

##### ALU Functions referenced explicitly ##########################
wordsig ALUADD	'A_ADD'		     # ALU should add its arguments

##### Possible instruction status values                       #####
wordsig SBUB	'STAT_BUB'	# Bubble in stage
wordsig SAOK	'STAT_AOK'	# Normal execution
wordsig SADR	'STAT_ADR'	# Invalid memory address
wordsig SINS	'STAT_INS'	# Invalid instruction
wordsig SHLT	'STAT_HLT'	# Halt instruction encountered

##### Signals that can be referenced by control logic ##############

##### Pipeline Register F ##########################################

wordsig F_predPC 'pc_curr->pc'	     # Predicted value of PC

##### Intermediate Values in Fetch Stage ###########################

wordsig imem_icode  'imem_icode'      # icode field from instruction memory
wordsig imem_ifun   'imem_ifun'       # ifun  field from instruction memory
wordsig f_icode	'if_id_next->icode'  # (Possibly modified) instruction code
wordsig f_ifun	'if_id_next->ifun'   # Fetched instruction function
wordsig f_valC	'if_id_next->valc'   # Constant data of fetched instruction
wordsig f_valP	'if_id_next->valp'   # Address of following instruction
boolsig imem_error 'imem_error'	     # Error signal from instruction memory
boolsig instr_valid 'instr_valid'    # Is fetched instruction valid?

##### Pipeline Register D ##########################################
wordsig D_icode 'if_id_curr->icode'   # Instruction code
wordsig D_rA 'if_id_curr->ra'	     # rA field from instruction
wordsig D_rB 'if_id_curr->rb'	     # rB field from instruction
wordsig D_valP 'if_id_curr->valp'     # Incremented PC

##### Intermediate Values in Decode Stage  #########################

wordsig d_srcA	 'id_ex_next->srca'  # srcA from decoded instruction
wordsig d_srcB	 'id_ex_next->srcb'  # srcB from decoded instruction
wordsig d_rvalA 'd_regvala'	     # valA read from register file
wordsig d_rvalB 'd_regvalb'	     # valB read from register file

##### Pipeline Register E ##########################################
wordsig E_icode 'id_ex_curr->icode'   # Instruction code
wordsig E_ifun  'id_ex_curr->ifun'    # Instruction function
wordsig E_valC  'id_ex_curr->valc'    # Constant data
wordsig E_srcA  'id_ex_curr->srca'    # Source A register ID
wordsig E_valA  'id_ex_curr->vala'    # Source A value
wordsig E_srcB  'id_ex_curr->srcb'    # Source B register ID
wordsig E_valB  'id_ex_curr->valb'    # Source B value
wordsig E_dstE 'id_ex_curr->deste'    # Destination E register ID
wordsig E_dstM 'id_ex_curr->destm'    # Destination M register ID

##### Intermediate Values in Execute Stage #########################
wordsig e_valE 'ex_mem_next->vale'	# valE generated by ALU
boolsig e_Cnd 'ex_mem_next->takebranch' # Does condition hold?
wordsig e_dstE 'ex_mem_next->deste'      # dstE (possibly modified to be RNONE)

##### Pipeline Register M                  #########################
wordsig M_stat 'ex_mem_curr->status'     # Instruction status
wordsig M_icode 'ex_mem_curr->icode'	# Instruction code
wordsig M_ifun  'ex_mem_curr->ifun'	# Instruction function
wordsig M_valA  'ex_mem_curr->vala'      # Source A value
wordsig M_dstE 'ex_mem_curr->deste'	# Destination E register ID
wordsig M_valE  'ex_mem_curr->vale'      # ALU E value
wordsig M_dstM 'ex_mem_curr->destm'	# Destination M register ID
boolsig M_Cnd 'ex_mem_curr->takebranch'	# Condition flag
boolsig dmem_error 'dmem_error'	        # Error signal from instruction memory

##### Intermediate Values in Memory Stage ##########################
wordsig m_valM 'mem_wb_next->valm'	# valM generated by memory
wordsig m_stat 'mem_wb_next->status'	# stat (possibly modified to be SADR)

##### Pipeline Register W ##########################################
wordsig W_stat 'mem_wb_curr->status'     # Instruction status
wordsig W_icode 'mem_wb_curr->icode'	# Instruction code
wordsig W_dstE 'mem_wb_curr->deste'	# Destination E register ID
wordsig W_valE  'mem_wb_curr->vale'      # ALU E value
wordsig W_dstM 'mem_wb_curr->destm'	# Destination M register ID
wordsig W_valM  'mem_wb_curr->valm'	# Memory M value

####################################################################
#    Control Signal Definitions.                                   #
####################################################################

################ Fetch Stage     ###################################

## What address should instruction be fetched at
word f_pc = [
	# Mispredicted branch.  Fetch at incremented PC
	M_icode == IJXX && !M_Cnd : M_valA;
	# Completion of RET instruction
	W_icode == IRET : W_valM;
	# Default: Use predicted value of PC
	1 : F_predPC;
];

## Determine icode of fetched instruction
word f_icode = [
	imem_error : INOP;
	1: imem_icode;
];

# Determine ifun
word f_ifun = [
	imem_error : FNONE;
	1: imem_ifun;
];

# Is instruction valid?
bool instr_valid = f_icode in 
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	  IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };

# Determine status code for fetched instruction
word f_stat = [
	imem_error: SADR;
	!instr_valid : SINS;
	f_icode == IHALT : SHLT;
	1 : SAOK;
];

# Does fetched instruction require a regid byte?
bool need_regids =
	f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ, 
		     IIRMOVQ, IRMMOVQ, IMRMOVQ };

# Does fetched instruction require a constant word?
bool need_valC =
	f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };

# Predict next value of PC
word f_predPC = [
	f_icode in { IJXX, ICALL } : f_valC;
	1 : f_valP;
];

################ Decode Stage ######################################


## What register should be used as the A source?
word d_srcA = [
	D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : D_rA;
	D_icode in { IPOPQ, IRET } : RRSP;
	1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word d_srcB = [
	D_icode in { IOPQ, IRMMOVQ, IMRMOVQ  } : D_rB;
	D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
word d_dstE = [
	D_icode in { IRRMOVQ, IIRMOVQ, IOPQ} : D_rB;
	D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

## What register should be used as the M destination?
word d_dstM = [
	D_icode in { IMRMOVQ, IPOPQ } : D_rA;
	1 : RNONE;  # Don't write any register
];

## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
	D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
	d_srcA == e_dstE : e_valE;    # Forward valE from execute
	d_srcA == M_dstM : m_valM;    # Forward valM from memory
	d_srcA == M_dstE : M_valE;    # Forward valE from memory
	d_srcA == W_dstM : W_valM;    # Forward valM from write back
	d_srcA == W_dstE : W_valE;    # Forward valE from write back
	1 : d_rvalA;  # Use value read from register file
];

word d_valB = [
	d_srcB == e_dstE : e_valE;    # Forward valE from execute
	d_srcB == M_dstM : m_valM;    # Forward valM from memory
	d_srcB == M_dstE : M_valE;    # Forward valE from memory
	d_srcB == W_dstM : W_valM;    # Forward valM from write back
	d_srcB == W_dstE : W_valE;    # Forward valE from write back
	1 : d_rvalB;  # Use value read from register file
];

################ Execute Stage #####################################

## Select input A to ALU
word aluA = [
	E_icode in { IRRMOVQ, IOPQ } : E_valA;
	E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : E_valC;
	E_icode in { ICALL, IPUSHQ } : -8;
	E_icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
	E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL, 
		     IPUSHQ, IRET, IPOPQ } : E_valB;
	E_icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
	E_icode == IOPQ : E_ifun;
	1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = E_icode == IOPQ &&
	# State changes only during normal operation
	!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };

## Generate valA in execute stage
word e_valA = E_valA;    # Pass valA through stage

## Set dstE to RNONE in event of not-taken conditional move
word e_dstE = [
	E_icode == IRRMOVQ && !e_Cnd : RNONE;
	1 : E_dstE;
];

################ Memory Stage ######################################

## Select memory address
word mem_addr = [
	M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
	M_icode in { IPOPQ, IRET } : M_valA;
	# Other instructions don't need address
];

## Set read control signal
bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };

#/* $begin pipe-m_stat-hcl */
## Update the status
word m_stat = [
	dmem_error : SADR;
	1 : M_stat;
];
#/* $end pipe-m_stat-hcl */

## Set E port register ID
word w_dstE = W_dstE;

## Set E port value
word w_valE = W_valE;

## Set M port register ID
word w_dstM = W_dstM;

## Set M port value
word w_valM = W_valM;

## Update processor status
word Stat = [
	W_stat == SBUB : SAOK;
	1 : W_stat;
];

################ Pipeline Register Control #########################

# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
bool F_bubble = 0;
bool F_stall =
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB } ||
	# Stalling at fetch while ret passes through pipeline
	IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
bool D_stall = 
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB };

bool D_bubble =
	# Mispredicted branch
	(E_icode == IJXX && !e_Cnd) ||
	# Stalling at fetch while ret passes through pipeline
	# but not condition for a load/use hazard
	!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
	  IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
bool E_stall = 0;
bool E_bubble =
	# Mispredicted branch
	(E_icode == IJXX && !e_Cnd) ||
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB};

# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
bool M_stall = 0;
# Start injecting bubbles as soon as exception passes through memory stage
bool M_bubble = m_stat in { SADR, SINS, SHLT } || W_stat in { SADR, SINS, SHLT };

# Should I stall or inject a bubble into Pipeline Register W?
bool W_stall = W_stat in { SADR, SINS, SHLT };
bool W_bubble = 0;
#/* $end pipe-all-hcl */

同Part B中的操作相似，我们要将IIADDQ指令整合到下方的逻辑选择电路中去。

小小的区别就是，这次我们处理的是五段式指令流水线PIPE：

iaddq格式同上，我们这里再写一次，如下，仅作参考：

其实现的功能，简单来说也就是将常数V直接加到寄存器rB中。

对应的6个阶段如下：

取指

$icode:ifun \gets M_1[PC]$

$rA:rB \gets M_1[PC+1]$

$valC \gets M_8[PC+2]$

$valP \gets PC + 10$
译码

$valB \gets R[rB]$
执行

$valE \gets valB + valC$
访存

无操作
写回

$R[rB] \gets valE$
更新PC

$PC \gets valP$

取指阶段 Fetch Stage

取指阶段共包含了f_pc, f_icode, f_ifun, instr_valid, f_stat, need_regids, need_valC和f_predPC，共8个控制逻辑块。

f_pc功能为对选择下一条执行的指令地址，默认情况下f_pc应当选择预测的地址，除非遇到分支预测错误即M_icode == IJXX && !M_Cnd，或者ret指令即W_icode == IRET，才应当进行地址修正（与此同时还会伴随暂停和气泡信号的插入，告知后续的电路停止执行指令，防止造成数据的污染）。这里不需要我们针对IIADDQ进行任何修改。

f_icode和f_ifun只有在内存读取异常时才会转为错误码，否则为内存中正常取出的计算指令和功能码。这里不需要我们针对IIADDQ进行任何修改。

f_stat功能为通过分析imem_error, instr_valid等信号，输出取指阶段中指令的临时状态，可输出的信号有SADR, SINS, SHLT和SAOK，这些状态码可以在第三版中文版P277查询到。这里不需要我们针对IIADDQ进行任何修改。

need_valC表示指令是否包含一个常数。同上，我们观察到，iaddq指令在其2~9字节存储了一个常数，也就是我们所说的常数V，即valC。因此，需要加入IIADDQ指令。`

f_predPC功能为选择预测指令地址，即从PC增加后的valP和指令自带常数valC中，选择一个。当指令为IJXX或ICALL等调用时，我们默认预测跳转指令成立，并选择跳转地址valC作为预测指令地址，否则就默认为当前指令的顺沿下一条地址也就是PC增加后的valP。这里不需要我们针对IIADDQ进行任何修改。

因此，修改后的结果为：

################ Fetch Stage     ###################################

## What address should instruction be fetched at
word f_pc = [
	# Mispredicted branch.  Fetch at incremented PC
	M_icode == IJXX && !M_Cnd : M_valA;
	# Completion of RET instruction
	W_icode == IRET : W_valM;
	# Default: Use predicted value of PC
	1 : F_predPC;
];

## Determine icode of fetched instruction
word f_icode = [
	imem_error : INOP;
	1: imem_icode;
];

# Determine ifun
word f_ifun = [
	imem_error : FNONE;
	1: imem_ifun;
];

# Is instruction valid?
bool instr_valid = f_icode in 
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	  IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };

# Determine status code for fetched instruction
word f_stat = [
	imem_error: SADR;
	!instr_valid : SINS;
	f_icode == IHALT : SHLT;
	1 : SAOK;
];

# Does fetched instruction require a regid byte?
bool need_regids =
	f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ, 
		     IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };

# Does fetched instruction require a constant word?
bool need_valC =
	f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

# Predict next value of PC
word f_predPC = [
	f_icode in { IJXX, ICALL } : f_valC;
	1 : f_valP;
];

译码和写回阶段 Decode&Writeback Stage

P.S. 在所有指令中，只有call在访存阶段需要valP的值，只有跳转指令在执行阶段（当不需要跳转时）需要valP的值，而这些指令都不需要从寄存器文件中读出的值，因此我们可以直接合并两个信号，将他们作为信号valA携带穿过流水线。

译码和写回阶段共包含了Sel+FwdA, FwdB, d_dstE, d_dstM, d_srcA和d_srcB这几个逻辑控制块。

d_srcA和d_srcB用于指定需要访问读取的寄存器，而d_dstE和d_dstM用于指定需要写入的寄存器目标。

Sel+FwdA和FwdB主要功能为确定传向E寄存器的valA和valB信号来源，即d_valA和d_valB。默认情况下这两个逻辑块应当通过srcA和srcB将寄存器文件中的内容读出，但是考虑到数据冒险，因此如果存在对寄存器进行读写的情况，我们就应该通过转发来解决，因此存在额外的5个转发通道。

很明显的，在iaddq指令中，我们需要对寄存器rB进行读写操作。因此，我们需要对指令IIADDQ指定读取访问寄存器为rB，写入目标寄存器为rB。

因此，此部分修改为：

################ Decode Stage ######################################


## What register should be used as the A source?
word d_srcA = [
	D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : D_rA;
	D_icode in { IPOPQ, IRET } : RRSP;
	1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word d_srcB = [
	D_icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ  } : D_rB;
	D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
word d_dstE = [
	D_icode in { IRRMOVQ, IIRMOVQ, IOPQ, IIADDQ} : D_rB;
	D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

## What register should be used as the M destination?
word d_dstM = [
	D_icode in { IMRMOVQ, IPOPQ } : D_rA;
	1 : RNONE;  # Don't write any register
];

## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
	D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
	d_srcA == e_dstE : e_valE;    # Forward valE from execute
	d_srcA == M_dstM : m_valM;    # Forward valM from memory
	d_srcA == M_dstE : M_valE;    # Forward valE from memory
	d_srcA == W_dstM : W_valM;    # Forward valM from write back
	d_srcA == W_dstE : W_valE;    # Forward valE from write back
	1 : d_rvalA;  # Use value read from register file
];

word d_valB = [
	d_srcB == e_dstE : e_valE;    # Forward valE from execute
	d_srcB == M_dstM : m_valM;    # Forward valM from memory
	d_srcB == M_dstE : M_valE;    # Forward valE from memory
	d_srcB == W_dstM : W_valM;    # Forward valM from write back
	d_srcB == W_dstE : W_valE;    # Forward valE from write back
	1 : d_rvalB;  # Use value read from register file
];

执行阶段 Excuse Stage

执行阶段中，其硬件单元和逻辑块同SEQ中十分相似，一方面对使用的信号进行了适当的重命名，一方面标号为Set CC的逻辑以信号m_stat和W_stat作为输入，这个逻辑决定了是否进行条件码的更新（例如导致异常的指令通过后方流水线时，任何对条件码的更新都应该被禁止）。

修改后逻辑如下：

################ Execute Stage #####################################

## Select input A to ALU
word aluA = [
	E_icode in { IRRMOVQ, IOPQ } : E_valA;
	E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : E_valC;
	E_icode in { ICALL, IPUSHQ } : -8;
	E_icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
	E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL, 
		     IPUSHQ, IRET, IPOPQ, IIADDQ } : E_valB;
	E_icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
	E_icode == IOPQ : E_ifun;
	1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = E_icode in {IOPQ, IIADDQ} &&
	# State changes only during normal operation
	!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };

## Generate valA in execute stage
word e_valA = E_valA;    # Pass valA through stage

## Set dstE to RNONE in event of not-taken conditional move
word e_dstE = [
	E_icode == IRRMOVQ && !e_Cnd : RNONE;
	1 : E_dstE;
];

访存阶段 Memory Stage

访存阶段的任务是根据给定的内存地址对内存进行读写，其中有mem_read, mem_write, mem_addr, mem_data和Stat，共5个控制块。

在上方译码过程中，我们提到了可以直接合并valA和valP两个信号，将他们作为信号valA携带穿过流水线。

总体上来说和原先的SEQ没有太大的区别。

mem_read表示是否只从内存读取数据的控制信号，同理mem_read表示是否只向内存写入数据的控制信号。

mem_addr控制块用来控制选择地址数据的来源（可以选择valE或者valA）。

最后Stat控制块根据取指阶段产生的icode, imem_error, instr_valid以及数据内存产生的dmem_error信号，从指令执行的结果来计算状态码Stat。

在IIADDQ指令中，并不存在和访存相关的操作，因此不需修改，维持原状：

################ Memory Stage ######################################

## Select memory address
word mem_addr = [
	M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
	M_icode in { IPOPQ, IRET } : M_valA;
	# Other instructions don't need address
];

## Set read control signal
bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };

#/* $begin pipe-m_stat-hcl */
## Update the status
word m_stat = [
	dmem_error : SADR;
	1 : M_stat;
];
#/* $end pipe-m_stat-hcl */

## Set E port register ID
word w_dstE = W_dstE;

## Set E port value
word w_valE = W_valE;

## Set M port register ID
word w_dstM = W_dstM;

## Set M port value
word w_valM = W_valM;

## Update processor status
word Stat = [
	W_stat == SBUB : SAOK;
	1 : W_stat;
];

流水线寄存器控制 Pipeline Register Control

在出现一些特殊控制情况时，例如加载/使用冒险，处理ret指令，预测错误的分支，出现异常等，我们要对流水线进行期望的操作，需要对对应的寄存器进行暂停或者加入气泡的操作。

对应的控制操作如图：

此外，还有可能出现特殊情况组合的情形，共有两种组合情形，其处理方式如下：

很显然在此部分中不涉及IIADDQ的部分，因为其不涉及访存阶段，因而也不存在加载/使用冒险的情况。

根据以上的处理情形，我们可以得到：

################ Pipeline Register Control #########################

# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
bool F_bubble = 0;
bool F_stall =
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB } ||
	# Stalling at fetch while ret passes through pipeline
	IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
bool D_stall = 
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB };

bool D_bubble =
	# Mispredicted branch
	(E_icode == IJXX && !e_Cnd) ||
	# Stalling at fetch while ret passes through pipeline
	# but not condition for a load/use hazard
	!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
	  IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
bool E_stall = 0;
bool E_bubble =
	# Mispredicted branch
	(E_icode == IJXX && !e_Cnd) ||
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB};

# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
bool M_stall = 0;
# Start injecting bubbles as soon as exception passes through memory stage
bool M_bubble = m_stat in { SADR, SINS, SHLT } || W_stat in { SADR, SINS, SHLT };

# Should I stall or inject a bubble into Pipeline Register W?
bool W_stall = W_stat in { SADR, SINS, SHLT };
bool W_bubble = 0;

测试

不管怎么说，我们先对刚才修改完成的hcl文件进行测试，我们使用命令make clean; make VERSION=full。

我们在编译时也会遇到类似Part B的错误：

由于 tcl 内部有部分数据结构已经被弃用，编译时可能会爆出错误 psim.c:853:8: error: ‘Tcl_Interp’ {aka ‘struct Tcl_Interp’} has no member named ‘result’，详见 Stack overflow，最简单的处理办法就是在 Makefile 上的 CFLAGS 上添加参数 -DUSE_INTERP_RESULT。

此外还有可能遇到编译问题 /usr/bin/ld: /tmp/ccHA7fEj.o:(.data.rel+0x0): undefined reference to matherr，详见 Stack overflow - Fail to build y86-64 simulator from sources。目测可以直接将 psim.c 中与 matherr 相关的代码注释即可，如下：

成功之后，我们对hcl文件进行简单的Y86-64程序测试：

显示程序通过

执行cd ../ptest; make SIM=../pipe/psim TFLAGS=-i进行回归测试：

显示成功通过

程序优化

我们观察给出的ncopy.ys代码：

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Loop header
	xorq %rax,%rax		# count = 0;
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:

Loop:	mrmovq (%rdi), %r10	# read val from src...
	rmmovq %r10, (%rsi)	# ...and store it to dst
	andq %r10, %r10		# val <= 0?
	jle Npos		# if so, goto Npos:
	irmovq $1, %r10
	addq %r10, %rax		# count++
Npos:	irmovq $1, %r10
	subq %r10, %rdx		# len--
	irmovq $8, %r10
	addq %r10, %rdi		# src++
	addq %r10, %rsi		# dst++
	andq %rdx,%rdx		# len > 0?
	jg Loop			# if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

在没有做任何优化的情况下，我们尝试对原始代码进行CPE测试：

1 2	./correctness.pl ./benchmark.pl

得分惨烈。

P.S. 玄学原因，在ncopy.ys文件的最后一行请留出一行空行，不然correctness.pl运行会报错🤔

详情见"can’t find label" error in Y86 compiler

加入iaddq

我们刚好可以使用刚刚实现的iaddq指令，来实现自加自减等功能，而不需要寄存器r10来中间转换。

转换后的代码如下：

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Loop header
	xorq %rax,%rax		# count = 0;
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:

Loop:	
	mrmovq (%rdi), %r10	# read val from src...
	rmmovq %r10, (%rsi)	# ...and store it to dst
	andq %r10, %r10		# val <= 0?
	jle Npos		# if so, goto Npos:
	iaddq $1, %rax		# count++
Npos:	
	iaddq $-1, %rdx		# len--
	iaddq $8, %rdi		# src++
	iaddq $8, %rsi		# dst++
	andq %rdx,%rdx		# len > 0?
	jg Loop			# if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

测试 CPE结果：

循环展开&加载使用冒险

通过循环展开，我们可以：

减少索引计算的次数
减少条件分支的判断次数

我们尝试以6路展开为例：

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion	
	# Loop header
	andq %rdx,%rdx		# len <= 0?
	jmp test
Loop:
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle Loop1
	iaddq $1,%rax
Loop1:
	mrmovq 8(%rdi),%r10
	rmmovq %r10,8(%rsi)
	andq %r10,%r10
	jle Loop2
	iaddq $1,%rax
Loop2:
	mrmovq 16(%rdi),%r10
	rmmovq %r10,16(%rsi)
	andq %r10,%r10
	jle Loop3
	iaddq $1,%rax
Loop3:
	mrmovq 24(%rdi),%r10
	rmmovq %r10,24(%rsi)
	andq %r10,%r10
	jle Loop4
	iaddq $1,%rax
Loop4:
	mrmovq 32(%rdi),%r10
	rmmovq %r10,32(%rsi)
	andq %r10,%r10
	jle Loop5
	iaddq $1,%rax
Loop5:
	mrmovq 40(%rdi),%r10
	rmmovq %r10,40(%rsi)
	iaddq $48,%rdi
	iaddq $48,%rsi
	andq %r10,%r10
	jle test
	iaddq $1,%rax	
test:
	iaddq $-6, %rdx			# 判断是否满6
	jge Loop				# 6路展开
	iaddq $-8,%rdi
	iaddq $-8,%rsi
	iaddq $6, %rdx	# 否则恢复原状
	jmp test2				#剩下的逐次一个一个处理
Remain:
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle test2
	iaddq $1,%rax
test2:
	iaddq $8,%rdi
	iaddq $8,%rsi
	iaddq $-1, %rdx
	jge Remain
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

在这样的操作下，我们通过CPE测试可以拿到18.3分：

可以看到对长度较小的数组，CPE数值还是很大，因为没有针对其进行循环展开，我们继续尝试对长度小于6的部分进行3路循环展开：

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion	
	# Loop header
	andq %rdx,%rdx		# len <= 0?
	jmp test
Loop:
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle Loop1
	iaddq $1,%rax
Loop1:
	mrmovq 8(%rdi),%r10
	rmmovq %r10,8(%rsi)
	andq %r10,%r10
	jle Loop2
	iaddq $1,%rax
Loop2:
	mrmovq 16(%rdi),%r10
	rmmovq %r10,16(%rsi)
	andq %r10,%r10
	jle Loop3
	iaddq $1,%rax
Loop3:
	mrmovq 24(%rdi),%r10
	rmmovq %r10,24(%rsi)
	andq %r10,%r10
	jle Loop4
	iaddq $1,%rax
Loop4:
	mrmovq 32(%rdi),%r10
	rmmovq %r10,32(%rsi)
	andq %r10,%r10
	jle Loop5
	iaddq $1,%rax
Loop5:
	mrmovq 40(%rdi),%r10
	rmmovq %r10,40(%rsi)
	iaddq $48,%rdi
	iaddq $48,%rsi
	andq %r10,%r10
	jle test
	iaddq $1,%rax	
test:
	iaddq $-6, %rdx
	jge Loop
	iaddq $6, %rdx
	jmp test2
L:
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle L1
	iaddq $1,%rax
L1:
	mrmovq 8(%rdi),%r10
	rmmovq %r10,8(%rsi)
	andq %r10,%r10
	jle L2
	iaddq $1,%rax
L2:
	mrmovq 16(%rdi),%r10
	rmmovq %r10,16(%rsi)
	iaddq $24,%rdi
	iaddq $24,%rsi
	andq %r10,%r10
	jle test2
	iaddq $1,%rax
test2:
	iaddq $-3, %rdx			# 先减，判断够不够3个
	jge L
	iaddq $2, %rdx			# -1则不剩了，直接Done,0 剩一个, 1剩2个
	je R0
	jl Done
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle R2
	iaddq $1,%rax
R2:
	mrmovq 8(%rdi),%r10
	rmmovq %r10,8(%rsi)
	andq %r10,%r10
	jle Done
	iaddq $1,%rax
	jmp Done
R0:
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle Done
	iaddq $1,%rax
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

通过对剩余的长度小于6的数组进行3路循环展开优化，我们进一步提升了CPE测试结果：

消除气泡

在刚才的程序中，根据注释提示不难看出，存在加载/使用冒险：

1 2	mrmovq (%rdi), %r10 # read val from src... rmmovq %r10, (%rsi) # ...and store it to dst

流水线为了正常运作，由于访存阶段处于较为靠后的阶段，下方指令需要等待上方指令运行到访存阶段才能正确使用寄存器值，因此存在一个气泡的浪费。也就是：

1
2
3

mrmovq (%rdi), %r10	# read val from src...
# bubble...
rmmovq %r10, (%rsi)	# ...and store it to dst

我们可以考虑将这两个操作分开？中间插入一些其他没有依赖先后关系的后续操作，例如直接操作下一个位置的val，由于r10寄存器还有用，我们暂用r11存储。这样既消除了气泡，同时又没有浪费性能，只是将下一步的操作提前了：

mrmovq (%rdi), %r10
mrmovq 8(%rdi), %r11
rmmovq %r10, (%rsi)
rmmovq %r11, 8(%rsi)

最终的结果如下：

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion	
	# Loop header
	andq %rdx,%rdx		# len <= 0?
	jmp test
Loop:
	mrmovq (%rdi),%r10
	mrmovq 8(%rdi),%r11
	rmmovq %r10,(%rsi)
	rmmovq %r11,8(%rsi)
	andq %r10,%r10
	jle Loop1
	iaddq $1,%rax
Loop1:
	andq %r11,%r11
	jle Loop2
	iaddq $1,%rax
Loop2:
	mrmovq 16(%rdi),%r10
	mrmovq 24(%rdi),%r11
	rmmovq %r10,16(%rsi)
	rmmovq %r11,24(%rsi)
	andq %r10,%r10
	jle Loop3
	iaddq $1,%rax
Loop3:
	andq %r11,%r11
	jle Loop4
	iaddq $1,%rax
Loop4:
	mrmovq 32(%rdi),%r10
	mrmovq 40(%rdi),%r11
	rmmovq %r10,32(%rsi)
	rmmovq %r11,40(%rsi)
	andq %r10,%r10
	jle Loop5
	iaddq $1,%rax
Loop5:
	iaddq $48,%rdi
	iaddq $48,%rsi
	andq %r11,%r11
	jle test
	iaddq $1,%rax	
test:
	iaddq $-6, %rdx
	jge Loop
	iaddq $6, %rdx
	jmp test2
L:
	mrmovq (%rdi),%r10
	mrmovq 8(%rdi),%r11
	rmmovq %r10,(%rsi)
	rmmovq %r11,8(%rsi)
	andq %r10,%r10
	jle L1
	iaddq $1,%rax
L1:
	andq %r11,%r11
	jle L2
	iaddq $1,%rax
L2:
	mrmovq 16(%rdi),%r10
	iaddq $24,%rdi
	rmmovq %r10,16(%rsi)
	iaddq $24,%rsi
	andq %r10,%r10
	jle test2
	iaddq $1,%rax
test2:
	iaddq $-3, %rdx			# 先减，判断够不够3个
	jge L
	iaddq $2, %rdx			# -1则不剩了，直接Done,0 剩一个, 1剩2个
	je R0
	jl Done
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle R2
	iaddq $1,%rax
R2:
	mrmovq 8(%rdi),%r10
	rmmovq %r10,8(%rsi)
	andq %r10,%r10
	jle Done
	iaddq $1,%rax
	jmp Done
R0:
	mrmovq (%rdi),%r10
	rmmovq %r10,(%rsi)
	andq %r10,%r10
	jle Done
	iaddq $1,%rax
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */