From mboxrd@z Thu Jan 1 00:00:00 1970 From: Robin Holt Date: Mon, 07 Aug 2006 15:46:08 +0000 Subject: Silent data corruption caused by XPC. Message-Id: <20060807154608.GC15674@lnx-holt.americas.sgi.com> List-Id: MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit To: linux-ia64@vger.kernel.org Jack Steiner identified a problem where XPC can cause a silent data corruption. On module load, the placement may cause the xpc_remote_copy_buffer to span two physical pages. DMA transfers are done to the start virtual address translated to physical. This patch changes the buffer from a statically allocated buffer to a kmalloc'd buffer. Dean Nelson reviewed this before posting. I have tested it in the configuration that was showing the memory corruption and verified it works. I also added DBUG_ON statements to help catch this if a similar situation is encountered. Signed-off-by: Robin Holt Signed-off-by: Dean Nelson Signed-off-by: Jack Steiner Index: linux-2.6/arch/ia64/sn/kernel/xpc_channel.c =================================--- linux-2.6.orig/arch/ia64/sn/kernel/xpc_channel.c 2006-05-03 19:37:42.430685318 -0500 +++ linux-2.6/arch/ia64/sn/kernel/xpc_channel.c 2006-08-07 10:38:54.827204041 -0500 @@ -255,6 +255,7 @@ xpc_pull_remote_cachelines(struct xpc_pa DBUG_ON((u64) src != L1_CACHE_ALIGN((u64) src)); DBUG_ON((u64) dst != L1_CACHE_ALIGN((u64) dst)); DBUG_ON(cnt != L1_CACHE_ALIGN(cnt)); + DBUG_ON((ia64_tpa(dst) + cnt) != ia64_tpa(&((char *) dst)[cnt])); if (part->act_state = XPC_P_DEACTIVATING) { return part->reason; Index: linux-2.6/arch/ia64/sn/kernel/xpc_partition.c =================================--- linux-2.6.orig/arch/ia64/sn/kernel/xpc_partition.c 2006-05-03 19:37:42.434684918 -0500 +++ linux-2.6/arch/ia64/sn/kernel/xpc_partition.c 2006-08-07 10:38:54.835205009 -0500 @@ -71,13 +71,9 @@ struct xpc_partition xpc_partitions[XP_M * Generic buffer used to store a local copy of portions of a remote * partition's reserved page (either its header and part_nasids mask, * or its vars). - * - * xpc_discovery runs only once and is a seperate thread that is - * very likely going to be processing in parallel with receiving - * interrupts. */ -char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE + - XP_NASID_MASK_BYTES]; +char *xpc_remote_copy_buffer; +void *xpc_remote_copy_buffer_base; /* @@ -125,6 +121,9 @@ xpc_get_rsvd_page_pa(int nasid) } } + DBUG_ON((ia64_tpa(buf) + buf_len) !+ ia64_tpa(&((char *) buf)[buf_len])); + bte_res = xp_bte_copy(rp_pa, ia64_tpa(buf), buf_len, (BTE_NOTIFY | BTE_WACQUIRE), NULL); if (bte_res != BTE_SUCCESS) { @@ -424,6 +423,9 @@ xpc_check_remote_hb(void) continue; } + DBUG_ON((ia64_tpa(remote_vars) + XPC_RP_VARS_SIZE) !+ ia64_tpa(&((char *) remote_vars)[XPC_RP_VARS_SIZE])); + /* pull the remote_hb cache line */ bres = xp_bte_copy(part->remote_vars_pa, ia64_tpa((u64) remote_vars), @@ -475,9 +477,11 @@ xpc_get_remote_rp(int nasid, u64 *discov return xpcNoRsvdPageAddr; } + DBUG_ON((ia64_tpa(remote_rp + XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes) !+ ia64_tpa(&((char *) remote_rp)[XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes]))); - /* pull over the reserved page header and part_nasids mask */ + /* pull over the reserved page header and part_nasids mask */ bres = xp_bte_copy(*remote_rp_pa, ia64_tpa((u64) remote_rp), XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE), NULL); @@ -533,9 +537,11 @@ xpc_get_remote_vars(u64 remote_vars_pa, return xpcVarsNotSet; } + DBUG_ON((ia64_tpa(remote_vars) + XPC_RP_VARS_SIZE) !+ ia64_tpa(&((char *) remote_vars)[XPC_RP_VARS_SIZE])); - /* pull over the cross partition variables */ + /* pull over the cross partition variables */ bres = xp_bte_copy(remote_vars_pa, ia64_tpa((u64) remote_vars), XPC_RP_VARS_SIZE, (BTE_NOTIFY | BTE_WACQUIRE), NULL); @@ -1219,6 +1225,9 @@ xpc_initiate_partid_to_nasids(partid_t p part_nasid_pa = (u64) XPC_RP_PART_NASIDS(part->remote_rp_pa); + DBUG_ON((ia64_tpa(nasid_mask) + xp_nasid_mask_bytes) !+ ia64_tpa(&((char *) nasid_mask)[xp_nasid_mask_bytes])); + bte_res = xp_bte_copy(part_nasid_pa, ia64_tpa((u64) nasid_mask), xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE), NULL); Index: linux-2.6/arch/ia64/sn/kernel/xpc_main.c =================================--- linux-2.6.orig/arch/ia64/sn/kernel/xpc_main.c 2006-05-03 19:37:42.434684918 -0500 +++ linux-2.6/arch/ia64/sn/kernel/xpc_main.c 2006-08-07 10:38:54.839205493 -0500 @@ -1053,6 +1053,8 @@ xpc_do_exit(enum xpc_retval reason) if (xpc_sysctl) { unregister_sysctl_table(xpc_sysctl); } + + kfree(xpc_remote_copy_buffer_base); } @@ -1213,24 +1215,19 @@ xpc_init(void) partid_t partid; struct xpc_partition *part; pid_t pid; + size_t buf_size; if (!ia64_platform_is("sn2")) { return -ENODEV; } - /* - * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng - * various portions of a partition's reserved page. Its size is based - * on the size of the reserved page header and part_nasids mask. So we - * need to ensure that the other items will fit as well. - */ - if (XPC_RP_VARS_SIZE > XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES) { - dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n"); - return -EPERM; - } - DBUG_ON((u64) xpc_remote_copy_buffer !- L1_CACHE_ALIGN((u64) xpc_remote_copy_buffer)); + + buf_size = max(XPC_RP_VARS_SIZE, XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES); + xpc_remote_copy_buffer = xpc_kmalloc_cacheline_aligned(buf_size, GFP_KERNEL, + &xpc_remote_copy_buffer_base); + if (xpc_remote_copy_buffer = NULL) + return -ENOMEM; snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part"); snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan"); @@ -1294,6 +1291,8 @@ xpc_init(void) if (xpc_sysctl) { unregister_sysctl_table(xpc_sysctl); } + + kfree(xpc_remote_copy_buffer_base); return -EBUSY; } @@ -1312,6 +1311,8 @@ xpc_init(void) if (xpc_sysctl) { unregister_sysctl_table(xpc_sysctl); } + + kfree(xpc_remote_copy_buffer_base); return -EBUSY; } @@ -1363,6 +1364,8 @@ xpc_init(void) if (xpc_sysctl) { unregister_sysctl_table(xpc_sysctl); } + + kfree(xpc_remote_copy_buffer_base); return -EBUSY; } Index: linux-2.6/include/asm-ia64/sn/xpc.h =================================--- linux-2.6.orig/include/asm-ia64/sn/xpc.h 2006-05-03 19:37:46.650263229 -0500 +++ linux-2.6/include/asm-ia64/sn/xpc.h 2006-08-07 10:38:55.011226303 -0500 @@ -684,7 +684,8 @@ extern struct xpc_vars *xpc_vars; extern struct xpc_rsvd_page *xpc_rsvd_page; extern struct xpc_vars_part *xpc_vars_part; extern struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1]; -extern char xpc_remote_copy_buffer[]; +extern char *xpc_remote_copy_buffer; +extern void *xpc_remote_copy_buffer_base; extern struct xpc_rsvd_page *xpc_rsvd_page_init(void); extern void xpc_allow_IPI_ops(void); extern void xpc_restrict_IPI_ops(void);