diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/Kconfig | 16 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/Makefile | 5 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-nvram.c | 88 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-rtc.c | 97 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-takeover.S | 140 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-wrappers.S | 109 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal.c | 322 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 1340 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-p5ioc2.c | 235 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 609 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 148 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/powernv.h | 16 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/setup.c | 195 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/smp.c | 181 |
14 files changed, 3501 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig new file mode 100644 index 00000000..74fea5c2 --- /dev/null +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -0,0 +1,16 @@ +config PPC_POWERNV + depends on PPC64 && PPC_BOOK3S + bool "IBM PowerNV (Non-Virtualized) platform support" + select PPC_NATIVE + select PPC_XICS + select PPC_ICP_NATIVE + select PPC_P7_NAP + select PPC_PCI_CHOICE if EMBEDDED + default y + +config PPC_POWERNV_RTAS + depends on PPC_POWERNV + bool "Support for RTAS based PowerNV platforms such as BML" + default y + select PPC_ICS_RTAS + select PPC_RTAS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile new file mode 100644 index 00000000..bcc3cb48 --- /dev/null +++ b/arch/powerpc/platforms/powernv/Makefile @@ -0,0 +1,5 @@ +obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o +obj-y += opal-rtc.o opal-nvram.o + +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c new file mode 100644 index 00000000..3f83e1ae --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -0,0 +1,88 @@ +/* + * PowerNV nvram code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> + +#include <asm/opal.h> +#include <asm/machdep.h> + +static unsigned int nvram_size; + +static ssize_t opal_nvram_size(void) +{ + return nvram_size; +} + +static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index) +{ + s64 rc; + int off; + + if (*index >= nvram_size) + return 0; + off = *index; + if ((off + count) > nvram_size) + count = nvram_size - off; + rc = opal_read_nvram(__pa(buf), count, off); + if (rc != OPAL_SUCCESS) + return -EIO; + *index += count; + return count; +} + +static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) +{ + s64 rc = OPAL_BUSY; + int off; + + if (*index >= nvram_size) + return 0; + off = *index; + if ((off + count) > nvram_size) + count = nvram_size - off; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_write_nvram(__pa(buf), count, off); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + } + *index += count; + return count; +} + +void __init opal_nvram_init(void) +{ + struct device_node *np; + const u32 *nbytes_p; + + np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram"); + if (np == NULL) + return; + + nbytes_p = of_get_property(np, "#bytes", NULL); + if (!nbytes_p) { + of_node_put(np); + return; + } + nvram_size = *nbytes_p; + + printk(KERN_INFO "OPAL nvram setup, %u bytes\n", nvram_size); + of_node_put(np); + + ppc_md.nvram_read = opal_nvram_read; + ppc_md.nvram_write = opal_nvram_write; + ppc_md.nvram_size = opal_nvram_size; +} + diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c new file mode 100644 index 00000000..2aa7641a --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -0,0 +1,97 @@ +/* + * PowerNV Real Time Clock. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/bcd.h> +#include <linux/rtc.h> +#include <linux/delay.h> + +#include <asm/opal.h> +#include <asm/firmware.h> + +static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) +{ + tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) + + bcd2bin((y_m_d >> 16) & 0xff)) - 1900; + tm->tm_mon = bcd2bin((y_m_d >> 8) & 0xff) - 1; + tm->tm_mday = bcd2bin(y_m_d & 0xff); + tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff); + tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff); + tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff); + + GregorianDay(tm); +} + +unsigned long __init opal_get_boot_time(void) +{ + struct rtc_time tm; + u32 y_m_d; + u64 h_m_s_ms; + long rc = OPAL_BUSY; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_read(&y_m_d, &h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + if (rc != OPAL_SUCCESS) + return 0; + opal_to_tm(y_m_d, h_m_s_ms, &tm); + return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); +} + +void opal_get_rtc_time(struct rtc_time *tm) +{ + long rc = OPAL_BUSY; + u32 y_m_d; + u64 h_m_s_ms; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_read(&y_m_d, &h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + if (rc != OPAL_SUCCESS) + return; + opal_to_tm(y_m_d, h_m_s_ms, tm); +} + +int opal_set_rtc_time(struct rtc_time *tm) +{ + long rc = OPAL_BUSY; + u32 y_m_d = 0; + u64 h_m_s_ms = 0; + + y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) / 100)) << 24; + y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) % 100)) << 16; + y_m_d |= ((u32)bin2bcd((tm->tm_mon + 1))) << 8; + y_m_d |= ((u32)bin2bcd(tm->tm_mday)); + + h_m_s_ms |= ((u64)bin2bcd(tm->tm_hour)) << 56; + h_m_s_ms |= ((u64)bin2bcd(tm->tm_min)) << 48; + h_m_s_ms |= ((u64)bin2bcd(tm->tm_sec)) << 40; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_write(y_m_d, h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + return rc == OPAL_SUCCESS ? 0 : -EIO; +} diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S new file mode 100644 index 00000000..77b48b2b --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-takeover.S @@ -0,0 +1,140 @@ +/* + * PowerNV OPAL takeover assembly code, for use by prom_init.c + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/ppc_asm.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/opal.h> + +#define STK_PARAM(i) (48 + ((i)-3)*8) + +#define H_HAL_TAKEOVER 0x5124 +#define H_HAL_TAKEOVER_QUERY_MAGIC -1 + + .text +_GLOBAL(opal_query_takeover) + mfcr r0 + stw r0,8(r1) + std r3,STK_PARAM(r3)(r1) + std r4,STK_PARAM(r4)(r1) + li r3,H_HAL_TAKEOVER + li r4,H_HAL_TAKEOVER_QUERY_MAGIC + HVSC + ld r10,STK_PARAM(r3)(r1) + std r4,0(r10) + ld r10,STK_PARAM(r4)(r1) + std r5,0(r10) + lwz r0,8(r1) + mtcrf 0xff,r0 + blr + +_GLOBAL(opal_do_takeover) + mfcr r0 + stw r0,8(r1) + mflr r0 + std r0,16(r1) + bl __opal_do_takeover + ld r0,16(r1) + mtlr r0 + lwz r0,8(r1) + mtcrf 0xff,r0 + blr + +__opal_do_takeover: + ld r4,0(r3) + ld r5,0x8(r3) + ld r6,0x10(r3) + ld r7,0x18(r3) + ld r8,0x20(r3) + ld r9,0x28(r3) + ld r10,0x30(r3) + ld r11,0x38(r3) + li r3,H_HAL_TAKEOVER + HVSC + blr + + .globl opal_secondary_entry +opal_secondary_entry: + mr r31,r3 + mfmsr r11 + li r12,(MSR_SF | MSR_ISF)@highest + sldi r12,r12,48 + or r11,r11,r12 + mtmsrd r11 + isync + mfspr r4,SPRN_PIR + std r4,0(r3) +1: HMT_LOW + ld r4,8(r3) + cmpli cr0,r4,0 + beq 1b + HMT_MEDIUM +1: addi r3,r31,16 + bl __opal_do_takeover + b 1b + +_GLOBAL(opal_enter_rtas) + mflr r0 + std r0,16(r1) + stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ + + /* Because PROM is running in 32b mode, it clobbers the high order half + * of all registers that it saves. We therefore save those registers + * PROM might touch to the stack. (r0, r3-r13 are caller saved) + */ + SAVE_GPR(2, r1) + SAVE_GPR(13, r1) + SAVE_8GPRS(14, r1) + SAVE_10GPRS(22, r1) + mfcr r10 + mfmsr r11 + std r10,_CCR(r1) + std r11,_MSR(r1) + + /* Get the PROM entrypoint */ + mtlr r5 + + /* Switch MSR to 32 bits mode + */ + li r12,1 + rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) + andc r11,r11,r12 + li r12,1 + rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) + andc r11,r11,r12 + mtmsrd r11 + isync + + /* Enter RTAS here... */ + blrl + + /* Just make sure that r1 top 32 bits didn't get + * corrupt by OF + */ + rldicl r1,r1,0,32 + + /* Restore the MSR (back to 64 bits) */ + ld r0,_MSR(r1) + MTMSRD(r0) + isync + + /* Restore other registers */ + REST_GPR(2, r1) + REST_GPR(13, r1) + REST_8GPRS(14, r1) + REST_10GPRS(22, r1) + ld r4,_CCR(r1) + mtcr r4 + + addi r1,r1,PROM_FRAME_SIZE + ld r0,16(r1) + mtlr r0 + blr diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S new file mode 100644 index 00000000..3bb07e5e --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -0,0 +1,109 @@ +/* + * PowerNV OPAL API wrappers + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/ppc_asm.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/opal.h> + +/* TODO: + * + * - Trace irqs in/off (needs saving/restoring all args, argh...) + * - Get r11 feed up by Dave so I can have better register usage + */ +#define OPAL_CALL(name, token) \ + _GLOBAL(name); \ + mflr r0; \ + mfcr r12; \ + std r0,16(r1); \ + std r12,8(r1); \ + std r1,PACAR1(r13); \ + li r0,0; \ + mfmsr r12; \ + ori r0,r0,MSR_EE; \ + std r12,PACASAVEDMSR(r13); \ + andc r12,r12,r0; \ + mtmsrd r12,1; \ + LOAD_REG_ADDR(r0,.opal_return); \ + mtlr r0; \ + li r0,MSR_DR|MSR_IR; \ + andc r12,r12,r0; \ + li r0,token; \ + mtspr SPRN_HSRR1,r12; \ + LOAD_REG_ADDR(r11,opal); \ + ld r12,8(r11); \ + ld r2,0(r11); \ + mtspr SPRN_HSRR0,r12; \ + hrfid + +_STATIC(opal_return) + ld r2,PACATOC(r13); + ld r4,8(r1); + ld r5,16(r1); + ld r6,PACASAVEDMSR(r13); + mtspr SPRN_SRR0,r5; + mtspr SPRN_SRR1,r6; + mtcr r4; + rfid + +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c new file mode 100644 index 00000000..aaa0dba4 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal.c @@ -0,0 +1,322 @@ +/* + * PowerNV OPAL high level interfaces + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/types.h> +#include <linux/of.h> +#include <linux/of_platform.h> +#include <linux/interrupt.h> +#include <asm/opal.h> +#include <asm/firmware.h> + +#include "powernv.h" + +struct opal { + u64 base; + u64 entry; +} opal; + +static struct device_node *opal_node; +static DEFINE_SPINLOCK(opal_write_lock); +extern u64 opal_mc_secondary_handler[]; + +int __init early_init_dt_scan_opal(unsigned long node, + const char *uname, int depth, void *data) +{ + const void *basep, *entryp; + unsigned long basesz, entrysz; + u64 glue; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz); + entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); + + if (!basep || !entryp) + return 1; + + opal.base = of_read_number(basep, basesz/4); + opal.entry = of_read_number(entryp, entrysz/4); + + pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n", + opal.base, basep, basesz); + pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n", + opal.entry, entryp, entrysz); + + powerpc_firmware_features |= FW_FEATURE_OPAL; + if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { + powerpc_firmware_features |= FW_FEATURE_OPALv2; + printk("OPAL V2 detected !\n"); + } else { + printk("OPAL V1 detected !\n"); + } + + /* Hookup some exception handlers. We use the fwnmi area at 0x7000 + * to provide the glue space to OPAL + */ + glue = 0x7000; + opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER, + __pa(opal_mc_secondary_handler[0]), + glue); + glue += 128; + opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER, + 0, glue); + glue += 128; + opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); + + return 1; +} + +int opal_get_chars(uint32_t vtermno, char *buf, int count) +{ + s64 len, rc; + u64 evt; + + if (!opal.entry) + return -ENODEV; + opal_poll_events(&evt); + if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0) + return 0; + len = count; + rc = opal_console_read(vtermno, &len, buf); + if (rc == OPAL_SUCCESS) + return len; + return 0; +} + +int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +{ + int written = 0; + s64 len, rc; + unsigned long flags; + u64 evt; + + if (!opal.entry) + return -ENODEV; + + /* We want put_chars to be atomic to avoid mangling of hvsi + * packets. To do that, we first test for room and return + * -EAGAIN if there isn't enough. + * + * Unfortunately, opal_console_write_buffer_space() doesn't + * appear to work on opal v1, so we just assume there is + * enough room and be done with it + */ + spin_lock_irqsave(&opal_write_lock, flags); + if (firmware_has_feature(FW_FEATURE_OPALv2)) { + rc = opal_console_write_buffer_space(vtermno, &len); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(&evt); + return -EAGAIN; + } + } + + /* We still try to handle partial completions, though they + * should no longer happen. + */ + rc = OPAL_BUSY; + while(total_len > 0 && (rc == OPAL_BUSY || + rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { + len = total_len; + rc = opal_console_write(vtermno, &len, data); + if (rc == OPAL_SUCCESS) { + total_len -= len; + data += len; + written += len; + } + /* This is a bit nasty but we need that for the console to + * flush when there aren't any interrupts. We will clean + * things a bit later to limit that to synchronous path + * such as the kernel console and xmon/udbg + */ + do + opal_poll_events(&evt); + while(rc == OPAL_SUCCESS && (evt & OPAL_EVENT_CONSOLE_OUTPUT)); + } + spin_unlock_irqrestore(&opal_write_lock, flags); + return written; +} + +int opal_machine_check(struct pt_regs *regs) +{ + struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt; + struct opal_machine_check_event evt; + const char *level, *sevstr, *subtype; + static const char *opal_mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + static const char *opal_mc_slb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *opal_mc_erat_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *opal_mc_tlb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + + /* Copy the event structure and release the original */ + evt = *opal_evt; + opal_evt->in_use = 0; + + /* Print things out */ + if (evt.version != OpalMCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; + } + switch(evt.severity) { + case OpalMCE_SEV_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case OpalMCE_SEV_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case OpalMCE_SEV_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case OpalMCE_SEV_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + evt.disposition == OpalMCE_DISPOSITION_RECOVERED ? + "Recovered" : "[Not recovered"); + printk("%s Initiator: %s\n", level, + evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown"); + switch(evt.error_type) { + case OpalMCE_ERROR_TYPE_UE: + subtype = evt.u.ue_error.ue_error_type < + ARRAY_SIZE(opal_mc_ue_types) ? + opal_mc_ue_types[evt.u.ue_error.ue_error_type] + : "Unknown"; + printk("%s Error type: UE [%s]\n", level, subtype); + if (evt.u.ue_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.ue_error.effective_address); + if (evt.u.ue_error.physical_address_provided) + printk("%s Physial address: %016llx\n", + level, evt.u.ue_error.physical_address); + break; + case OpalMCE_ERROR_TYPE_SLB: + subtype = evt.u.slb_error.slb_error_type < + ARRAY_SIZE(opal_mc_slb_types) ? + opal_mc_slb_types[evt.u.slb_error.slb_error_type] + : "Unknown"; + printk("%s Error type: SLB [%s]\n", level, subtype); + if (evt.u.slb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.slb_error.effective_address); + break; + case OpalMCE_ERROR_TYPE_ERAT: + subtype = evt.u.erat_error.erat_error_type < + ARRAY_SIZE(opal_mc_erat_types) ? + opal_mc_erat_types[evt.u.erat_error.erat_error_type] + : "Unknown"; + printk("%s Error type: ERAT [%s]\n", level, subtype); + if (evt.u.erat_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.erat_error.effective_address); + break; + case OpalMCE_ERROR_TYPE_TLB: + subtype = evt.u.tlb_error.tlb_error_type < + ARRAY_SIZE(opal_mc_tlb_types) ? + opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type] + : "Unknown"; + printk("%s Error type: TLB [%s]\n", level, subtype); + if (evt.u.tlb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.tlb_error.effective_address); + break; + default: + case OpalMCE_ERROR_TYPE_UNKNOWN: + printk("%s Error type: Unknown\n", level); + break; + } + return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1; +} + +static irqreturn_t opal_interrupt(int irq, void *data) +{ + uint64_t events; + + opal_handle_interrupt(virq_to_hw(irq), &events); + + /* XXX TODO: Do something with the events */ + + return IRQ_HANDLED; +} + +static int __init opal_init(void) +{ + struct device_node *np, *consoles; + const u32 *irqs; + int rc, i, irqlen; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_warn("opal: Node not found\n"); + return -ENODEV; + } + if (firmware_has_feature(FW_FEATURE_OPALv2)) + consoles = of_find_node_by_path("/ibm,opal/consoles"); + else + consoles = of_node_get(opal_node); + + /* Register serial ports */ + for_each_child_of_node(consoles, np) { + if (strcmp(np->name, "serial")) + continue; + of_platform_device_create(np, NULL, NULL); + } + of_node_put(consoles); + + /* Find all OPAL interrupts and request them */ + irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); + pr_debug("opal: Found %d interrupts reserved for OPAL\n", + irqs ? (irqlen / 4) : 0); + for (i = 0; irqs && i < (irqlen / 4); i++, irqs++) { + unsigned int hwirq = be32_to_cpup(irqs); + unsigned int irq = irq_create_mapping(NULL, hwirq); + if (irq == NO_IRQ) { + pr_warning("opal: Failed to map irq 0x%x\n", hwirq); + continue; + } + rc = request_irq(irq, opal_interrupt, 0, "opal", NULL); + if (rc) + pr_warning("opal: Error %d requesting irq %d" + " (0x%x)\n", rc, irq, hwirq); + } + return 0; +} +subsys_initcall(opal_init); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c new file mode 100644 index 00000000..fbdd74da --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -0,0 +1,1340 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/abs_addr.h> + +#include "powernv.h" +#include "pci.h" + +struct resource_wrap { + struct list_head link; + resource_size_t size; + resource_size_t align; + struct pci_dev *dev; /* Set if it's a device */ + struct pci_bus *bus; /* Set if it's a bridge */ +}; + +static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe, + struct va_format *vaf) +{ + char pfix[32]; + + if (pe->pdev) + strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); + else + sprintf(pfix, "%04x:%02x ", + pci_domain_nr(pe->pbus), pe->pbus->number); + return printk("pci %s%s: [PE# %.3d] %pV", level, pfix, pe->pe_number, vaf); +} + +#define define_pe_printk_level(func, kern_level) \ +static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + int r; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __pe_printk(kern_level, pe, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ + +define_pe_printk_level(pe_err, KERN_ERR); +define_pe_printk_level(pe_warn, KERN_WARNING); +define_pe_printk_level(pe_info, KERN_INFO); + + +/* Calculate resource usage & alignment requirement of a single + * device. This will also assign all resources within the device + * for a given type starting at 0 for the biggest one and then + * assigning in decreasing order of size. + */ +static void __devinit pnv_ioda_calc_dev(struct pci_dev *dev, unsigned int flags, + resource_size_t *size, + resource_size_t *align) +{ + resource_size_t start; + struct resource *r; + int i; + + pr_devel(" -> CDR %s\n", pci_name(dev)); + + *size = *align = 0; + + /* Clear the resources out and mark them all unset */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (!(r->flags & flags)) + continue; + if (r->start) { + r->end -= r->start; + r->start = 0; + } + r->flags |= IORESOURCE_UNSET; + } + + /* We currently keep all memory resources together, we + * will handle prefetch & 64-bit separately in the future + * but for now we stick everybody in M32 + */ + start = 0; + for (;;) { + resource_size_t max_size = 0; + int max_no = -1; + + /* Find next biggest resource */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_UNSET) || + !(r->flags & flags)) + continue; + if (resource_size(r) > max_size) { + max_size = resource_size(r); + max_no = i; + } + } + if (max_no < 0) + break; + r = &dev->resource[max_no]; + if (max_size > *align) + *align = max_size; + *size += max_size; + r->start = start; + start += max_size; + r->end = r->start + max_size - 1; + r->flags &= ~IORESOURCE_UNSET; + pr_devel(" -> R%d %016llx..%016llx\n", + max_no, r->start, r->end); + } + pr_devel(" <- CDR %s size=%llx align=%llx\n", + pci_name(dev), *size, *align); +} + +/* Allocate a resource "wrap" for a given device or bridge and + * insert it at the right position in the sorted list + */ +static void __devinit pnv_ioda_add_wrap(struct list_head *list, + struct pci_bus *bus, + struct pci_dev *dev, + resource_size_t size, + resource_size_t align) +{ + struct resource_wrap *w1, *w = kzalloc(sizeof(*w), GFP_KERNEL); + + w->size = size; + w->align = align; + w->dev = dev; + w->bus = bus; + + list_for_each_entry(w1, list, link) { + if (w1->align < align) { + list_add_tail(&w->link, &w1->link); + return; + } + } + list_add_tail(&w->link, list); +} + +/* Offset device resources of a given type */ +static void __devinit pnv_ioda_offset_dev(struct pci_dev *dev, + unsigned int flags, + resource_size_t offset) +{ + struct resource *r; + int i; + + pr_devel(" -> ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (r->flags & flags) { + dev->resource[i].start += offset; + dev->resource[i].end += offset; + } + } + + pr_devel(" <- ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); +} + +/* Offset bus resources (& all children) of a given type */ +static void __devinit pnv_ioda_offset_bus(struct pci_bus *bus, + unsigned int flags, + resource_size_t offset) +{ + struct resource *r; + struct pci_dev *dev; + struct pci_bus *cbus; + int i; + + pr_devel(" -> OBR %s [%x] +%016llx\n", + bus->self ? pci_name(bus->self) : "root", flags, offset); + + pci_bus_for_each_resource(bus, r, i) { + if (r && (r->flags & flags)) { + r->start += offset; + r->end += offset; + } + } + list_for_each_entry(dev, &bus->devices, bus_list) + pnv_ioda_offset_dev(dev, flags, offset); + list_for_each_entry(cbus, &bus->children, node) + pnv_ioda_offset_bus(cbus, flags, offset); + + pr_devel(" <- OBR %s [%x]\n", + bus->self ? pci_name(bus->self) : "root", flags); +} + +/* This is the guts of our IODA resource allocation. This is called + * recursively for each bus in the system. It calculates all the + * necessary size and requirements for children and assign them + * resources such that: + * + * - Each function fits in it's own contiguous set of IO/M32 + * segment + * + * - All segments behind a P2P bridge are contiguous and obey + * alignment constraints of those bridges + */ +static void __devinit pnv_ioda_calc_bus(struct pci_bus *bus, unsigned int flags, + resource_size_t *size, + resource_size_t *align) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + resource_size_t dev_size, dev_align, start; + resource_size_t min_align, min_balign; + struct pci_dev *cdev; + struct pci_bus *cbus; + struct list_head head; + struct resource_wrap *w; + unsigned int bres; + + *size = *align = 0; + + pr_devel("-> CBR %s [%x]\n", + bus->self ? pci_name(bus->self) : "root", flags); + + /* Calculate alignment requirements based on the type + * of resource we are working on + */ + if (flags & IORESOURCE_IO) { + bres = 0; + min_align = phb->ioda.io_segsize; + min_balign = 0x1000; + } else { + bres = 1; + min_align = phb->ioda.m32_segsize; + min_balign = 0x100000; + } + + /* Gather all our children resources ordered by alignment */ + INIT_LIST_HEAD(&head); + + /* - Busses */ + list_for_each_entry(cbus, &bus->children, node) { + pnv_ioda_calc_bus(cbus, flags, &dev_size, &dev_align); + pnv_ioda_add_wrap(&head, cbus, NULL, dev_size, dev_align); + } + + /* - Devices */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + pnv_ioda_calc_dev(cdev, flags, &dev_size, &dev_align); + /* Align them to segment size */ + if (dev_align < min_align) + dev_align = min_align; + pnv_ioda_add_wrap(&head, NULL, cdev, dev_size, dev_align); + } + if (list_empty(&head)) + goto empty; + + /* Now we can do two things: assign offsets to them within that + * level and get our total alignment & size requirements. The + * assignment algorithm is going to be uber-trivial for now, we + * can try to be smarter later at filling out holes. + */ + if (bus->self) { + /* No offset for downstream bridges */ + start = 0; + } else { + /* Offset from the root */ + if (flags & IORESOURCE_IO) + /* Don't hand out IO 0 */ + start = hose->io_resource.start + 0x1000; + else + start = hose->mem_resources[0].start; + } + while(!list_empty(&head)) { + w = list_first_entry(&head, struct resource_wrap, link); + list_del(&w->link); + if (w->size) { + if (start) { + start = ALIGN(start, w->align); + if (w->dev) + pnv_ioda_offset_dev(w->dev,flags,start); + else if (w->bus) + pnv_ioda_offset_bus(w->bus,flags,start); + } + if (w->align > *align) + *align = w->align; + } + start += w->size; + kfree(w); + } + *size = start; + + /* Align and setup bridge resources */ + *align = max_t(resource_size_t, *align, + max_t(resource_size_t, min_align, min_balign)); + *size = ALIGN(*size, + max_t(resource_size_t, min_align, min_balign)); + empty: + /* Only setup P2P's, not the PHB itself */ + if (bus->self) { + struct resource *res = bus->resource[bres]; + + if (WARN_ON(res == NULL)) + return; + + /* + * FIXME: We should probably export and call + * pci_bridge_check_ranges() to properly re-initialize + * the PCI portion of the flags here, and to detect + * what the bridge actually supports. + */ + res->start = 0; + res->flags = (*size) ? flags : 0; + res->end = (*size) ? (*size - 1) : 0; + } + + pr_devel("<- CBR %s [%x] *size=%016llx *align=%016llx\n", + bus->self ? pci_name(bus->self) : "root", flags,*size,*align); +} + +static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) +{ + struct device_node *np; + + np = pci_device_to_OF_node(dev); + if (!np) + return NULL; + return PCI_DN(np); +} + +static void __devinit pnv_ioda_setup_pe_segments(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + unsigned int pe, i; + resource_size_t pos; + struct resource io_res; + struct resource m32_res; + struct pci_bus_region region; + int rc; + + /* Anything not referenced in the device-tree gets PE#0 */ + pe = pdn ? pdn->pe_number : 0; + + /* Calculate the device min/max */ + io_res.start = m32_res.start = (resource_size_t)-1; + io_res.end = m32_res.end = 0; + io_res.flags = IORESOURCE_IO; + m32_res.flags = IORESOURCE_MEM; + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *r = NULL; + if (dev->resource[i].flags & IORESOURCE_IO) + r = &io_res; + if (dev->resource[i].flags & IORESOURCE_MEM) + r = &m32_res; + if (!r) + continue; + if (dev->resource[i].start < r->start) + r->start = dev->resource[i].start; + if (dev->resource[i].end > r->end) + r->end = dev->resource[i].end; + } + + /* Setup IO segments */ + if (io_res.start < io_res.end) { + pcibios_resource_to_bus(dev, ®ion, &io_res); + pos = region.start; + i = pos / phb->ioda.io_segsize; + while(i < phb->ioda.total_pe && pos <= region.end) { + if (phb->ioda.io_segmap[i]) { + pr_err("%s: Trying to use IO seg #%d which is" + " already used by PE# %d\n", + pci_name(dev), i, + phb->ioda.io_segmap[i]); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + phb->ioda.io_segmap[i] = pe; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, + OPAL_IO_WINDOW_TYPE, + 0, i); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d setting up mapping" + " for IO seg# %d\n", + pci_name(dev), rc, i); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + pos += phb->ioda.io_segsize; + i++; + }; + } + + /* Setup M32 segments */ + if (m32_res.start < m32_res.end) { + pcibios_resource_to_bus(dev, ®ion, &m32_res); + pos = region.start; + i = pos / phb->ioda.m32_segsize; + while(i < phb->ioda.total_pe && pos <= region.end) { + if (phb->ioda.m32_segmap[i]) { + pr_err("%s: Trying to use M32 seg #%d which is" + " already used by PE# %d\n", + pci_name(dev), i, + phb->ioda.m32_segmap[i]); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + phb->ioda.m32_segmap[i] = pe; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, + OPAL_M32_WINDOW_TYPE, + 0, i); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d setting up mapping" + " for M32 seg# %d\n", + pci_name(dev), rc, i); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + pos += phb->ioda.m32_segsize; + i++; + } + } +} + +/* Check if a resource still fits in the total IO or M32 range + * for a given PHB + */ +static int __devinit pnv_ioda_resource_fit(struct pci_controller *hose, + struct resource *r) +{ + struct resource *bounds; + + if (r->flags & IORESOURCE_IO) + bounds = &hose->io_resource; + else if (r->flags & IORESOURCE_MEM) + bounds = &hose->mem_resources[0]; + else + return 1; + + if (r->start >= bounds->start && r->end <= bounds->end) + return 1; + r->flags = 0; + return 0; +} + +static void __devinit pnv_ioda_update_resources(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_bus *cbus; + struct pci_dev *cdev; + unsigned int i; + + /* We used to clear all device enables here. However it looks like + * clearing MEM enable causes Obsidian (IPR SCS) to go bonkers, + * and shoot fatal errors to the PHB which in turns fences itself + * and we can't recover from that ... yet. So for now, let's leave + * the enables as-is and hope for the best. + */ + + /* Check if bus resources fit in our IO or M32 range */ + for (i = 0; bus->self && (i < 2); i++) { + struct resource *r = bus->resource[i]; + if (r && !pnv_ioda_resource_fit(hose, r)) + pr_err("%s: Bus %d resource %d disabled, no room\n", + pci_name(bus->self), bus->number, i); + } + + /* Update self if it's not a PHB */ + if (bus->self) + pci_setup_bridge(bus); + + /* Update child devices */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + /* Check if resource fits, if not, disabled it */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *r = &cdev->resource[i]; + if (!pnv_ioda_resource_fit(hose, r)) + pr_err("%s: Resource %d disabled, no room\n", + pci_name(cdev), i); + } + + /* Assign segments */ + pnv_ioda_setup_pe_segments(cdev); + + /* Update HW BARs */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) + pci_update_resource(cdev, i); + } + + /* Update child busses */ + list_for_each_entry(cbus, &bus->children, node) + pnv_ioda_update_resources(cbus); +} + +static int __devinit pnv_ioda_alloc_pe(struct pnv_phb *phb) +{ + unsigned long pe; + + do { + pe = find_next_zero_bit(phb->ioda.pe_alloc, + phb->ioda.total_pe, 0); + if (pe >= phb->ioda.total_pe) + return IODA_INVALID_PE; + } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + + phb->ioda.pe_array[pe].pe_number = pe; + return pe; +} + +static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +{ + WARN_ON(phb->ioda.pe_array[pe].pdev); + + memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe, phb->ioda.pe_alloc); +} + +/* Currently those 2 are only used when MSIs are enabled, this will change + * but in the meantime, we need to protect them to avoid warnings + */ +#ifdef CONFIG_PCI_MSI +static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (!pdn) + return NULL; + if (pdn->pe_number == IODA_INVALID_PE) + return NULL; + return &phb->ioda.pe_array[pdn->pe_number]; +} + +static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev) +{ + struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev); + + while (!pe && dev->bus->self) { + dev = dev->bus->self; + pe = __pnv_ioda_get_one_pe(dev); + if (pe) + pe = pe->bus_pe; + } + return pe; +} +#endif /* CONFIG_PCI_MSI */ + +static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + long rc, rid_end, rid; + + /* Bus validation ? */ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + count = pe->pbus->subordinate - pe->pbus->secondary + 1; + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + pr_err("%s: Number of subordinate busses %d" + " unsupported\n", + pci_name(pe->pbus->self), count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* Associate PE in PELT */ + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_MAP_PE); + if (rc) { + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + return -ENXIO; + } + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Add to all parents PELT-V */ + while (parent) { + struct pci_dn *pdn = pnv_ioda_get_pdn(parent); + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + /* Setup reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = pe->pe_number; + + /* Setup one MVTs on IODA1 */ + if (phb->type == PNV_PHB_IODA1) { + pe->mve_number = pe->pe_number; + rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, + pe->pe_number); + if (rc) { + pe_err(pe, "OPAL error %ld setting up MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } else { + rc = opal_pci_set_mve_enable(phb->opal_id, + pe->mve_number, OPAL_ENABLE_MVE); + if (rc) { + pe_err(pe, "OPAL error %ld enabling MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } + } + } else if (phb->type == PNV_PHB_IODA2) + pe->mve_number = 0; + + return 0; +} + +static void __devinit pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pnv_ioda_pe *lpe; + + list_for_each_entry(lpe, &phb->ioda.pe_list, link) { + if (lpe->dma_weight < pe->dma_weight) { + list_add_tail(&pe->link, &lpe->link); + return; + } + } + list_add_tail(&pe->link, &phb->ioda.pe_list); +} + +static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) +{ + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + + /* If it's a bridge, no DMA */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + /* Reduce the weight of slow USB controllers */ + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + return 3; + + /* Increase the weight of RAID (includes Obsidian) */ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + return 15; + + /* Default */ + return 10; +} + +static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pnv_ioda_pe *pe; + int pe_num; + + if (!pdn) { + pr_err("%s: Device tree node not associated properly\n", + pci_name(dev)); + return NULL; + } + if (pdn->pe_number != IODA_INVALID_PE) + return NULL; + + /* PE#0 has been pre-set */ + if (dev->bus->number == 0) + pe_num = 0; + else + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling device\n", + pci_name(dev)); + return NULL; + } + + /* NOTE: We get only one ref to the pci_dev for the pdn, not for the + * pointer in the PE data structure, both should be destroyed at the + * same time. However, this needs to be looked at more closely again + * once we actually start removing things (Hotplug, SR-IOV, ...) + * + * At some point we want to remove the PDN completely anyways + */ + pe = &phb->ioda.pe_array[pe_num]; + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe_num; + pe->pdev = dev; + pe->pbus = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = dev->bus->number << 8 | pdn->devfn; + + pe_info(pe, "Associated device to PE\n"); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pdn->pe_number = IODA_INVALID_PE; + pe->pdev = NULL; + pci_dev_put(dev); + return NULL; + } + + /* Assign a DMA weight to the device */ + pe->dma_weight = pnv_ioda_dma_weight(dev); + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); + + return pe; +} + +static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (pdn == NULL) { + pr_warn("%s: No device node associated with device !\n", + pci_name(dev)); + continue; + } + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe->pe_number; + pe->dma_weight += pnv_ioda_dma_weight(dev); + if (dev->subordinate) + pnv_ioda_setup_same_PE(dev->subordinate, pe); + } +} + +static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, + struct pnv_ioda_pe *ppe) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_bus *bus = dev->subordinate; + struct pnv_ioda_pe *pe; + int pe_num; + + if (!bus) { + pr_warning("%s: Bridge without a subordinate bus !\n", + pci_name(dev)); + return; + } + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling bus\n", + pci_name(dev)); + return; + } + + pe = &phb->ioda.pe_array[pe_num]; + ppe->bus_pe = pe; + pe->pbus = bus; + pe->pdev = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = bus->secondary << 8; + pe->dma_weight = 0; + + pe_info(pe, "Secondary busses %d..%d associated with PE\n", + bus->secondary, bus->subordinate); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pbus = NULL; + return; + } + + /* Associate it with all child devices */ + pnv_ioda_setup_same_PE(bus, pe); + + /* Account for one DMA PE if at least one DMA capable device exist + * below the bridge + */ + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); +} + +static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus) +{ + struct pci_dev *dev; + struct pnv_ioda_pe *pe; + + list_for_each_entry(dev, &bus->devices, bus_list) { + pe = pnv_ioda_setup_dev_PE(dev); + if (pe == NULL) + continue; + /* Leaving the PCIe domain ... single PE# */ + if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + pnv_ioda_setup_bus_PE(dev, pe); + else if (dev->subordinate) + pnv_ioda_setup_PEs(dev->subordinate); + } +} + +static void __devinit pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, + struct pci_dev *dev) +{ + /* We delay DMA setup after we have assigned all PE# */ +} + +static void __devinit pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, + struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, &pe->tce32_table); + if (dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + +static void __devinit pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + unsigned int base, + unsigned int segs) +{ + + struct page *tce_mem = NULL; + const __be64 *swinvp; + struct iommu_table *tbl; + unsigned int i; + int64_t rc; + void *addr; + + /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) + + /* XXX FIXME: Handle 64-bit only DMA devices */ + /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ + /* XXX FIXME: Allocate multi-level tables on PHB3 */ + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* Grab a 32-bit TCE table */ + pe->tce32_seg = base; + pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", + (base << 28), ((base + segs) << 28) - 1); + + /* XXX Currently, we allocate one big contiguous table for the + * TCEs. We only really need one chunk per 256M of TCE space + * (ie per segment) but that's an optimization for later, it + * requires some added smarts with our get/put_tce implementation + */ + tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, + get_order(TCE32_TABLE_SIZE * segs)); + if (!tce_mem) { + pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, TCE32_TABLE_SIZE * segs); + + /* Configure HW */ + for (i = 0; i < segs; i++) { + rc = opal_pci_map_pe_dma_window(phb->opal_id, + pe->pe_number, + base + i, 1, + __pa(addr) + TCE32_TABLE_SIZE * i, + TCE32_TABLE_SIZE, 0x1000); + if (rc) { + pe_err(pe, " Failed to configure 32-bit TCE table," + " err %ld\n", rc); + goto fail; + } + } + + /* Setup linux iommu table */ + tbl = &pe->tce32_table; + pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, + base << 28); + + /* OPAL variant of P7IOC SW invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data + * to or. Since the bus is only printed out on table free + * errors, and on the first pass the data will be a relative + * bus number, print that out instead. + */ + tbl->it_busno = 0; + tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE + | TCE_PCI_SWINV_PAIR; + } + iommu_init_table(tbl, phb->hose->node); + + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus); + + return; + fail: + /* XXX Failure: Try to fallback to 64-bit only ? */ + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + if (tce_mem) + __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); +} + +static void __devinit pnv_ioda_setup_dma(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + unsigned int residual, remaining, segs, tw, base; + struct pnv_ioda_pe *pe; + + /* If we have more PE# than segments available, hand out one + * per PE until we run out and let the rest fail. If not, + * then we assign at least one segment per PE, plus more based + * on the amount of devices under that PE + */ + if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) + residual = 0; + else + residual = phb->ioda.tce32_count - + phb->ioda.dma_pe_count; + + pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", + hose->global_number, phb->ioda.tce32_count); + pr_info("PCI: %d PE# for a total weight of %d\n", + phb->ioda.dma_pe_count, phb->ioda.dma_weight); + + /* Walk our PE list and configure their DMA segments, hand them + * out one base segment plus any residual segments based on + * weight + */ + remaining = phb->ioda.tce32_count; + tw = phb->ioda.dma_weight; + base = 0; + list_for_each_entry(pe, &phb->ioda.pe_list, link) { + if (!pe->dma_weight) + continue; + if (!remaining) { + pe_warn(pe, "No DMA32 resources available\n"); + continue; + } + segs = 1; + if (residual) { + segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; + if (segs > remaining) + segs = remaining; + } + pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", + pe->dma_weight, segs); + pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + remaining -= segs; + base += segs; + } +} + +#ifdef CONFIG_PCI_MSI +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int is_64, + struct msi_msg *msg) +{ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + unsigned int xive_num = hwirq - phb->msi_base; + uint64_t addr64; + uint32_t addr32, data; + int rc; + + /* No PE assigned ? bail out ... no MSI for you ! */ + if (pe == NULL) + return -ENXIO; + + /* Check if we have an MVE */ + if (pe->mve_number < 0) + return -ENXIO; + + /* Assign XIVE to PE */ + rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); + if (rc) { + pr_warn("%s: OPAL error %d setting XIVE %d PE\n", + pci_name(dev), rc, xive_num); + return -EIO; + } + + if (is_64) { + rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, + &addr64, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = addr64 >> 32; + msg->address_lo = addr64 & 0xfffffffful; + } else { + rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, + &addr32, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = 0; + msg->address_lo = addr32; + } + msg->data = data; + + pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," + " address=%x_%08x data=%x PE# %d\n", + pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, + msg->address_hi, msg->address_lo, data, pe->pe_number); + + return 0; +} + +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +{ + unsigned int bmap_size; + const __be32 *prop = of_get_property(phb->hose->dn, + "ibm,opal-msi-ranges", NULL); + if (!prop) { + /* BML Fallback */ + prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); + } + if (!prop) + return; + + phb->msi_base = be32_to_cpup(prop); + phb->msi_count = be32_to_cpup(prop + 1); + bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long); + phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL); + if (!phb->msi_map) { + pr_err("PCI %d: Failed to allocate MSI bitmap !\n", + phb->hose->global_number); + return; + } + phb->msi_setup = pnv_pci_ioda_msi_setup; + phb->msi32_support = 1; + pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", + phb->msi_count, phb->msi_base); +} +#else +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +/* This is the starting point of our IODA specific resource + * allocation process + */ +static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose) +{ + resource_size_t size, align; + struct pci_bus *child; + + /* Associate PEs per functions */ + pnv_ioda_setup_PEs(hose->bus); + + /* Calculate all resources */ + pnv_ioda_calc_bus(hose->bus, IORESOURCE_IO, &size, &align); + pnv_ioda_calc_bus(hose->bus, IORESOURCE_MEM, &size, &align); + + /* Apply then to HW */ + pnv_ioda_update_resources(hose->bus); + + /* Setup DMA */ + pnv_ioda_setup_dma(hose->private_data); + + /* Configure PCI Express settings */ + list_for_each_entry(child, &hose->bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + pcie_bus_configure_settings(child, self->pcie_mpss); + } +} + +/* Prevent enabling devices for which we couldn't properly + * assign a PE + */ +static int __devinit pnv_pci_enable_device_hook(struct pci_dev *dev) +{ + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return -EINVAL; + return 0; +} + +static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, + u32 devfn) +{ + return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; +} + +void __init pnv_pci_init_ioda1_phb(struct device_node *np) +{ + struct pci_controller *hose; + static int primary = 1; + struct pnv_phb *phb; + unsigned long size, m32map_off, iomap_off, pemap_off; + const u64 *prop64; + u64 phb_id; + void *aux; + long rc; + + pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-phbid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-phbid\" property !\n"); + return; + } + phb_id = be64_to_cpup(prop64); + pr_debug(" PHB-ID : 0x%016llx\n", phb_id); + + phb = alloc_bootmem(sizeof(struct pnv_phb)); + if (phb) { + memset(phb, 0, sizeof(struct pnv_phb)); + phb->hose = hose = pcibios_alloc_controller(np); + } + if (!phb || !phb->hose) { + pr_err("PCI: Failed to allocate PCI controller for %s\n", + np->full_name); + return; + } + + spin_lock_init(&phb->lock); + /* XXX Use device-tree */ + hose->first_busno = 0; + hose->last_busno = 0xff; + hose->private_data = phb; + phb->opal_id = phb_id; + phb->type = PNV_PHB_IODA1; + + /* Detect specific models for error handling */ + if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) + phb->model = PNV_PHB_MODEL_P7IOC; + else + phb->model = PNV_PHB_MODEL_UNKNOWN; + + /* We parse "ranges" now since we need to deduce the register base + * from the IO base + */ + pci_process_bridge_OF_ranges(phb->hose, np, primary); + primary = 0; + + /* Magic formula from Milton */ + phb->regs = of_iomap(np, 0); + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + + + /* XXX This is hack-a-thon. This needs to be changed so that: + * - we obtain stuff like PE# etc... from device-tree + * - we properly re-allocate M32 ourselves + * (the OFW one isn't very good) + */ + + /* Initialize more IODA stuff */ + phb->ioda.total_pe = 128; + + phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); + /* OFW Has already off top 64k of M32 space (MSI space) */ + phb->ioda.m32_size += 0x10000; + + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_pci_base = hose->mem_resources[0].start - + hose->pci_mem_offset; + phb->ioda.io_size = hose->pci_io_size; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + + /* Allocate aux data & arrays */ + size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + m32map_off = size; + size += phb->ioda.total_pe; + iomap_off = size; + size += phb->ioda.total_pe; + pemap_off = size; + size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + aux = alloc_bootmem(size); + memset(aux, 0, size); + phb->ioda.pe_alloc = aux; + phb->ioda.m32_segmap = aux + m32map_off; + phb->ioda.io_segmap = aux + iomap_off; + phb->ioda.pe_array = aux + pemap_off; + set_bit(0, phb->ioda.pe_alloc); + + INIT_LIST_HEAD(&phb->ioda.pe_list); + + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + + /* Clear unusable m64 */ + hose->mem_resources[1].flags = 0; + hose->mem_resources[1].start = 0; + hose->mem_resources[1].end = 0; + hose->mem_resources[2].flags = 0; + hose->mem_resources[2].start = 0; + hose->mem_resources[2].end = 0; + +#if 0 + rc = opal_pci_set_phb_mem_window(opal->phb_id, + window_type, + window_num, + starting_real_address, + starting_pci_address, + segment_size); +#endif + + pr_info(" %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n", + phb->ioda.total_pe, + phb->ioda.m32_size, phb->ioda.m32_segsize, + phb->ioda.io_size, phb->ioda.io_segsize); + + if (phb->regs) { + pr_devel(" BUID = 0x%016llx\n", in_be64(phb->regs + 0x100)); + pr_devel(" PHB2_CR = 0x%016llx\n", in_be64(phb->regs + 0x160)); + pr_devel(" IO_BAR = 0x%016llx\n", in_be64(phb->regs + 0x170)); + pr_devel(" IO_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x178)); + pr_devel(" IO_SAR = 0x%016llx\n", in_be64(phb->regs + 0x180)); + pr_devel(" M32_BAR = 0x%016llx\n", in_be64(phb->regs + 0x190)); + pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198)); + pr_devel(" M32_SAR = 0x%016llx\n", in_be64(phb->regs + 0x1a0)); + } + phb->hose->ops = &pnv_pci_ops; + + /* Setup RID -> PE mapping function */ + phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; + + /* Setup TCEs */ + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + + /* Setup MSI support */ + pnv_pci_init_ioda_msis(phb); + + /* We set both PCI_PROBE_ONLY and PCI_REASSIGN_ALL_RSRC. This is an + * odd combination which essentially means that we skip all resource + * fixups and assignments in the generic code, and do it all + * ourselves here + */ + ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb; + ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; + pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC); + + /* Reset IODA tables to a clean state */ + rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); + if (rc) + pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); + opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE); +} + +void __init pnv_pci_init_ioda_hub(struct device_node *np) +{ + struct device_node *phbn; + const u64 *prop64; + u64 hub_id; + + pr_info("Probing IODA IO-Hub %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-hubid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + return; + } + hub_id = be64_to_cpup(prop64); + pr_devel(" HUB-ID : 0x%016llx\n", hub_id); + + /* Count child PHBs */ + for_each_child_of_node(np, phbn) { + /* Look for IODA1 PHBs */ + if (of_device_is_compatible(phbn, "ibm,ioda-phb")) + pnv_pci_init_ioda1_phb(phbn); + } +} diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c new file mode 100644 index 00000000..26496777 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -0,0 +1,235 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Currently supports only P5IOC2 + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/abs_addr.h> + +#include "powernv.h" +#include "pci.h" + +/* For now, use a fixed amount of TCE memory for each p5ioc2 + * hub, 16M will do + */ +#define P5IOC2_TCE_MEMORY 0x01000000 + +#ifdef CONFIG_PCI_MSI +static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int is_64, + struct msi_msg *msg) +{ + if (WARN_ON(!is_64)) + return -ENXIO; + msg->data = hwirq - phb->msi_base; + msg->address_hi = 0x10000000; + msg->address_lo = 0; + + return 0; +} + +static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) +{ + unsigned int bmap_size; + const __be32 *prop = of_get_property(phb->hose->dn, + "ibm,opal-msi-ranges", NULL); + if (!prop) + return; + + /* Don't do MSI's on p5ioc2 PCI-X are they are not properly + * verified in HW + */ + if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix")) + return; + phb->msi_base = be32_to_cpup(prop); + phb->msi_count = be32_to_cpup(prop + 1); + bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long); + phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL); + if (!phb->msi_map) { + pr_err("PCI %d: Failed to allocate MSI bitmap !\n", + phb->hose->global_number); + return; + } + phb->msi_setup = pnv_pci_p5ioc2_msi_setup; + phb->msi32_support = 0; + pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", + phb->msi_count, phb->msi_base); +} +#else +static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +static void __devinit pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, + struct pci_dev *pdev) +{ + if (phb->p5ioc2.iommu_table.it_map == NULL) + iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); + + set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); +} + +static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, + void *tce_mem, u64 tce_size) +{ + struct pnv_phb *phb; + const u64 *prop64; + u64 phb_id; + int64_t rc; + static int primary = 1; + + pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-phbid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-phbid\" property !\n"); + return; + } + phb_id = be64_to_cpup(prop64); + pr_devel(" PHB-ID : 0x%016llx\n", phb_id); + pr_devel(" TCE AT : 0x%016lx\n", __pa(tce_mem)); + pr_devel(" TCE SZ : 0x%016llx\n", tce_size); + + rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size); + if (rc != OPAL_SUCCESS) { + pr_err(" Failed to set TCE memory, OPAL error %lld\n", rc); + return; + } + + phb = alloc_bootmem(sizeof(struct pnv_phb)); + if (phb) { + memset(phb, 0, sizeof(struct pnv_phb)); + phb->hose = pcibios_alloc_controller(np); + } + if (!phb || !phb->hose) { + pr_err(" Failed to allocate PCI controller\n"); + return; + } + + spin_lock_init(&phb->lock); + phb->hose->first_busno = 0; + phb->hose->last_busno = 0xff; + phb->hose->private_data = phb; + phb->opal_id = phb_id; + phb->type = PNV_PHB_P5IOC2; + phb->model = PNV_PHB_MODEL_P5IOC2; + + phb->regs = of_iomap(np, 0); + + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + else { + pr_devel(" P_BUID = 0x%08x\n", in_be32(phb->regs + 0x100)); + pr_devel(" P_IOSZ = 0x%08x\n", in_be32(phb->regs + 0x1b0)); + pr_devel(" P_IO_ST = 0x%08x\n", in_be32(phb->regs + 0x1e0)); + pr_devel(" P_MEM1_H = 0x%08x\n", in_be32(phb->regs + 0x1a0)); + pr_devel(" P_MEM1_L = 0x%08x\n", in_be32(phb->regs + 0x190)); + pr_devel(" P_MSZ1_L = 0x%08x\n", in_be32(phb->regs + 0x1c0)); + pr_devel(" P_MEM_ST = 0x%08x\n", in_be32(phb->regs + 0x1d0)); + pr_devel(" P_MEM2_H = 0x%08x\n", in_be32(phb->regs + 0x2c0)); + pr_devel(" P_MEM2_L = 0x%08x\n", in_be32(phb->regs + 0x2b0)); + pr_devel(" P_MSZ2_H = 0x%08x\n", in_be32(phb->regs + 0x2d0)); + pr_devel(" P_MSZ2_L = 0x%08x\n", in_be32(phb->regs + 0x2e0)); + } + + /* Interpret the "ranges" property */ + /* This also maps the I/O region and sets isa_io/mem_base */ + pci_process_bridge_OF_ranges(phb->hose, np, primary); + primary = 0; + + phb->hose->ops = &pnv_pci_ops; + + /* Setup MSI support */ + pnv_pci_init_p5ioc2_msis(phb); + + /* Setup TCEs */ + phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup; + pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table, + tce_mem, tce_size, 0); +} + +void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) +{ + struct device_node *phbn; + const u64 *prop64; + u64 hub_id; + void *tce_mem; + uint64_t tce_per_phb; + int64_t rc; + int phb_count = 0; + + pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-hubid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + return; + } + hub_id = be64_to_cpup(prop64); + pr_info(" HUB-ID : 0x%016llx\n", hub_id); + + /* Currently allocate 16M of TCE memory for every Hub + * + * XXX TODO: Make it chip local if possible + */ + tce_mem = __alloc_bootmem(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY, + __pa(MAX_DMA_ADDRESS)); + if (!tce_mem) { + pr_err(" Failed to allocate TCE Memory !\n"); + return; + } + pr_debug(" TCE : 0x%016lx..0x%016lx\n", + __pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1); + rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem), + P5IOC2_TCE_MEMORY); + if (rc != OPAL_SUCCESS) { + pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc); + return; + } + + /* Count child PHBs */ + for_each_child_of_node(np, phbn) { + if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || + of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) + phb_count++; + } + + /* Calculate how much TCE space we can give per PHB */ + tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count); + pr_info(" Allocating %lld MB of TCE memory per PHB\n", + tce_per_phb >> 20); + + /* Initialize PHBs */ + for_each_child_of_node(np, phbn) { + if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || + of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) { + pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb); + tce_mem += tce_per_phb; + } + } +} diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c new file mode 100644 index 00000000..be3cfc5c --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci.c @@ -0,0 +1,609 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Currently supports only P5IOC2 + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/abs_addr.h> +#include <asm/firmware.h> + +#include "powernv.h" +#include "pci.h" + +/* Delay in usec */ +#define PCI_RESET_DELAY_US 3000000 + +#define cfg_dbg(fmt...) do { } while(0) +//#define cfg_dbg(fmt...) printk(fmt) + +#ifdef CONFIG_PCI_MSI +static int pnv_msi_check_device(struct pci_dev* pdev, int nvec, int type) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + + return (phb && phb->msi_map) ? 0 : -ENODEV; +} + +static unsigned int pnv_get_one_msi(struct pnv_phb *phb) +{ + unsigned long flags; + unsigned int id, rc; + + spin_lock_irqsave(&phb->lock, flags); + + id = find_next_zero_bit(phb->msi_map, phb->msi_count, phb->msi_next); + if (id >= phb->msi_count && phb->msi_next) + id = find_next_zero_bit(phb->msi_map, phb->msi_count, 0); + if (id >= phb->msi_count) { + rc = 0; + goto out; + } + __set_bit(id, phb->msi_map); + rc = id + phb->msi_base; +out: + spin_unlock_irqrestore(&phb->lock, flags); + return rc; +} + +static void pnv_put_msi(struct pnv_phb *phb, unsigned int hwirq) +{ + unsigned long flags; + unsigned int id; + + if (WARN_ON(hwirq < phb->msi_base || + hwirq >= (phb->msi_base + phb->msi_count))) + return; + id = hwirq - phb->msi_base; + + spin_lock_irqsave(&phb->lock, flags); + __clear_bit(id, phb->msi_map); + spin_unlock_irqrestore(&phb->lock, flags); +} + +static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct msi_desc *entry; + struct msi_msg msg; + unsigned int hwirq, virq; + int rc; + + if (WARN_ON(!phb)) + return -ENODEV; + + list_for_each_entry(entry, &pdev->msi_list, list) { + if (!entry->msi_attrib.is_64 && !phb->msi32_support) { + pr_warn("%s: Supports only 64-bit MSIs\n", + pci_name(pdev)); + return -ENXIO; + } + hwirq = pnv_get_one_msi(phb); + if (!hwirq) { + pr_warn("%s: Failed to find a free MSI\n", + pci_name(pdev)); + return -ENOSPC; + } + virq = irq_create_mapping(NULL, hwirq); + if (virq == NO_IRQ) { + pr_warn("%s: Failed to map MSI to linux irq\n", + pci_name(pdev)); + pnv_put_msi(phb, hwirq); + return -ENOMEM; + } + rc = phb->msi_setup(phb, pdev, hwirq, entry->msi_attrib.is_64, + &msg); + if (rc) { + pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); + irq_dispose_mapping(virq); + pnv_put_msi(phb, hwirq); + return rc; + } + irq_set_msi_desc(virq, entry); + write_msi_msg(virq, &msg); + } + return 0; +} + +static void pnv_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct msi_desc *entry; + + if (WARN_ON(!phb)) + return; + + list_for_each_entry(entry, &pdev->msi_list, list) { + if (entry->irq == NO_IRQ) + continue; + irq_set_msi_desc(entry->irq, NULL); + pnv_put_msi(phb, virq_to_hw(entry->irq)); + irq_dispose_mapping(entry->irq); + } +} +#endif /* CONFIG_PCI_MSI */ + +static void pnv_pci_dump_p7ioc_diag_data(struct pnv_phb *phb) +{ + struct OpalIoP7IOCPhbErrorData *data = &phb->diag.p7ioc; + int i; + + pr_info("PHB %d diagnostic data:\n", phb->hose->global_number); + + pr_info(" brdgCtl = 0x%08x\n", data->brdgCtl); + + pr_info(" portStatusReg = 0x%08x\n", data->portStatusReg); + pr_info(" rootCmplxStatus = 0x%08x\n", data->rootCmplxStatus); + pr_info(" busAgentStatus = 0x%08x\n", data->busAgentStatus); + + pr_info(" deviceStatus = 0x%08x\n", data->deviceStatus); + pr_info(" slotStatus = 0x%08x\n", data->slotStatus); + pr_info(" linkStatus = 0x%08x\n", data->linkStatus); + pr_info(" devCmdStatus = 0x%08x\n", data->devCmdStatus); + pr_info(" devSecStatus = 0x%08x\n", data->devSecStatus); + + pr_info(" rootErrorStatus = 0x%08x\n", data->rootErrorStatus); + pr_info(" uncorrErrorStatus = 0x%08x\n", data->uncorrErrorStatus); + pr_info(" corrErrorStatus = 0x%08x\n", data->corrErrorStatus); + pr_info(" tlpHdr1 = 0x%08x\n", data->tlpHdr1); + pr_info(" tlpHdr2 = 0x%08x\n", data->tlpHdr2); + pr_info(" tlpHdr3 = 0x%08x\n", data->tlpHdr3); + pr_info(" tlpHdr4 = 0x%08x\n", data->tlpHdr4); + pr_info(" sourceId = 0x%08x\n", data->sourceId); + + pr_info(" errorClass = 0x%016llx\n", data->errorClass); + pr_info(" correlator = 0x%016llx\n", data->correlator); + + pr_info(" p7iocPlssr = 0x%016llx\n", data->p7iocPlssr); + pr_info(" p7iocCsr = 0x%016llx\n", data->p7iocCsr); + pr_info(" lemFir = 0x%016llx\n", data->lemFir); + pr_info(" lemErrorMask = 0x%016llx\n", data->lemErrorMask); + pr_info(" lemWOF = 0x%016llx\n", data->lemWOF); + pr_info(" phbErrorStatus = 0x%016llx\n", data->phbErrorStatus); + pr_info(" phbFirstErrorStatus = 0x%016llx\n", data->phbFirstErrorStatus); + pr_info(" phbErrorLog0 = 0x%016llx\n", data->phbErrorLog0); + pr_info(" phbErrorLog1 = 0x%016llx\n", data->phbErrorLog1); + pr_info(" mmioErrorStatus = 0x%016llx\n", data->mmioErrorStatus); + pr_info(" mmioFirstErrorStatus = 0x%016llx\n", data->mmioFirstErrorStatus); + pr_info(" mmioErrorLog0 = 0x%016llx\n", data->mmioErrorLog0); + pr_info(" mmioErrorLog1 = 0x%016llx\n", data->mmioErrorLog1); + pr_info(" dma0ErrorStatus = 0x%016llx\n", data->dma0ErrorStatus); + pr_info(" dma0FirstErrorStatus = 0x%016llx\n", data->dma0FirstErrorStatus); + pr_info(" dma0ErrorLog0 = 0x%016llx\n", data->dma0ErrorLog0); + pr_info(" dma0ErrorLog1 = 0x%016llx\n", data->dma0ErrorLog1); + pr_info(" dma1ErrorStatus = 0x%016llx\n", data->dma1ErrorStatus); + pr_info(" dma1FirstErrorStatus = 0x%016llx\n", data->dma1FirstErrorStatus); + pr_info(" dma1ErrorLog0 = 0x%016llx\n", data->dma1ErrorLog0); + pr_info(" dma1ErrorLog1 = 0x%016llx\n", data->dma1ErrorLog1); + + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { + if ((data->pestA[i] >> 63) == 0 && + (data->pestB[i] >> 63) == 0) + continue; + pr_info(" PE[%3d] PESTA = 0x%016llx\n", i, data->pestA[i]); + pr_info(" PESTB = 0x%016llx\n", data->pestB[i]); + } +} + +static void pnv_pci_dump_phb_diag_data(struct pnv_phb *phb) +{ + switch(phb->model) { + case PNV_PHB_MODEL_P7IOC: + pnv_pci_dump_p7ioc_diag_data(phb); + break; + default: + pr_warning("PCI %d: Can't decode this PHB diag data\n", + phb->hose->global_number); + } +} + +static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) +{ + unsigned long flags, rc; + int has_diag; + + spin_lock_irqsave(&phb->lock, flags); + + rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + has_diag = (rc == OPAL_SUCCESS); + + rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + if (rc) { + pr_warning("PCI %d: Failed to clear EEH freeze state" + " for PE#%d, err %ld\n", + phb->hose->global_number, pe_no, rc); + + /* For now, let's only display the diag buffer when we fail to clear + * the EEH status. We'll do more sensible things later when we have + * proper EEH support. We need to make sure we don't pollute ourselves + * with the normal errors generated when probing empty slots + */ + if (has_diag) + pnv_pci_dump_phb_diag_data(phb); + else + pr_warning("PCI %d: No diag data available\n", + phb->hose->global_number); + } + + spin_unlock_irqrestore(&phb->lock, flags); +} + +static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus, + u32 bdfn) +{ + s64 rc; + u8 fstate; + u16 pcierr; + u32 pe_no; + + /* Get PE# if we support IODA */ + pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0; + + /* Read freeze status */ + rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr, + NULL); + if (rc) { + pr_warning("PCI %d: Failed to read EEH status for PE#%d," + " err %lld\n", phb->hose->global_number, pe_no, rc); + return; + } + cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n", + bdfn, pe_no, fstate); + if (fstate != 0) + pnv_pci_handle_eeh_config(phb, pe_no); +} + +static int pnv_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; + s64 rc; + + if (hose == NULL) + return PCIBIOS_DEVICE_NOT_FOUND; + + switch (size) { + case 1: { + u8 v8; + rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8); + *val = (rc == OPAL_SUCCESS) ? v8 : 0xff; + break; + } + case 2: { + u16 v16; + rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where, + &v16); + *val = (rc == OPAL_SUCCESS) ? v16 : 0xffff; + break; + } + case 4: { + u32 v32; + rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32); + *val = (rc == OPAL_SUCCESS) ? v32 : 0xffffffff; + break; + } + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n", + bus->number, devfn, where, size, *val); + + /* Check if the PHB got frozen due to an error (no response) */ + pnv_pci_config_check_eeh(phb, bus, bdfn); + + return PCIBIOS_SUCCESSFUL; +} + +static int pnv_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; + + if (hose == NULL) + return PCIBIOS_DEVICE_NOT_FOUND; + + cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n", + bus->number, devfn, where, size, val); + switch (size) { + case 1: + opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); + break; + case 2: + opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val); + break; + case 4: + opal_pci_config_write_word(phb->opal_id, bdfn, where, val); + break; + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + /* Check if the PHB got frozen due to an error (no response) */ + pnv_pci_config_check_eeh(phb, bus, bdfn); + + return PCIBIOS_SUCCESSFUL; +} + +struct pci_ops pnv_pci_ops = { + .read = pnv_pci_read_config, + .write = pnv_pci_write_config, +}; + + +static void pnv_tce_invalidate(struct iommu_table *tbl, + u64 *startp, u64 *endp) +{ + u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + + start = __pa(startp); + end = __pa(endp); + + + /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ + if (tbl->it_busno) { + start <<= 12; + end <<= 12; + inc = 128 << 12; + start |= tbl->it_busno; + end |= tbl->it_busno; + } + /* p7ioc-style invalidation, 2 TCEs per write */ + else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { + start |= (1ull << 63); + end |= (1ull << 63); + inc = 16; + } + /* Default (older HW) */ + else + inc = 128; + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Ensure above stores are visible */ + while (start <= end) { + __raw_writeq(start, invalidate); + start += inc; + } + /* The iommu layer will do another mb() for us on build() and + * we don't care on free() + */ +} + + +static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + u64 proto_tce; + u64 *tcep, *tces; + u64 rpn; + + proto_tce = TCE_PCI_READ; // Read allowed + + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; + rpn = __pa(uaddr) >> TCE_SHIFT; + + while (npages--) + *(tcep++) = proto_tce | (rpn++ << TCE_RPN_SHIFT); + + /* Some implementations won't cache invalid TCEs and thus may not + * need that flush. We'll probably turn it_type into a bit mask + * of flags if that becomes the case + */ + if (tbl->it_type & TCE_PCI_SWINV_CREATE) + pnv_tce_invalidate(tbl, tces, tcep - 1); + + return 0; +} + +static void pnv_tce_free(struct iommu_table *tbl, long index, long npages) +{ + u64 *tcep, *tces; + + tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; + + while (npages--) + *(tcep++) = 0; + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_tce_invalidate(tbl, tces, tcep - 1); +} + +void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset) +{ + tbl->it_blocksize = 16; + tbl->it_base = (unsigned long)tce_mem; + tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT; + tbl->it_index = 0; + tbl->it_size = tce_size >> 3; + tbl->it_busno = 0; + tbl->it_type = TCE_PCI; +} + +static struct iommu_table * __devinit +pnv_pci_setup_bml_iommu(struct pci_controller *hose) +{ + struct iommu_table *tbl; + const __be64 *basep, *swinvp; + const __be32 *sizep; + + basep = of_get_property(hose->dn, "linux,tce-base", NULL); + sizep = of_get_property(hose->dn, "linux,tce-size", NULL); + if (basep == NULL || sizep == NULL) { + pr_err("PCI: %s has missing tce entries !\n", + hose->dn->full_name); + return NULL; + } + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node); + if (WARN_ON(!tbl)) + return NULL; + pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), + be32_to_cpup(sizep), 0); + iommu_init_table(tbl, hose->node); + + /* Deal with SW invalidated TCEs when needed (BML way) */ + swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", + NULL); + if (swinvp) { + tbl->it_busno = swinvp[1]; + tbl->it_index = (unsigned long)ioremap(swinvp[0], 8); + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; + } + return tbl; +} + +static void __devinit pnv_pci_dma_fallback_setup(struct pci_controller *hose, + struct pci_dev *pdev) +{ + struct device_node *np = pci_bus_to_OF_node(hose->bus); + struct pci_dn *pdn; + + if (np == NULL) + return; + pdn = PCI_DN(np); + if (!pdn->iommu_table) + pdn->iommu_table = pnv_pci_setup_bml_iommu(hose); + if (!pdn->iommu_table) + return; + set_iommu_table_base(&pdev->dev, pdn->iommu_table); +} + +static void __devinit pnv_pci_dma_dev_setup(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + + /* If we have no phb structure, try to setup a fallback based on + * the device-tree (RTAS PCI for example) + */ + if (phb && phb->dma_dev_setup) + phb->dma_dev_setup(phb, pdev); + else + pnv_pci_dma_fallback_setup(hose, pdev); +} + +/* Fixup wrong class code in p7ioc root complex */ +static void __devinit pnv_p7ioc_rc_quirk(struct pci_dev *dev) +{ + dev->class = PCI_CLASS_BRIDGE_PCI << 8; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk); + +static int pnv_pci_probe_mode(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + const __be64 *tstamp; + u64 now, target; + + + /* We hijack this as a way to ensure we have waited long + * enough since the reset was lifted on the PCI bus + */ + if (bus != hose->bus) + return PCI_PROBE_NORMAL; + tstamp = of_get_property(hose->dn, "reset-clear-timestamp", NULL); + if (!tstamp || !*tstamp) + return PCI_PROBE_NORMAL; + + now = mftb() / tb_ticks_per_usec; + target = (be64_to_cpup(tstamp) / tb_ticks_per_usec) + + PCI_RESET_DELAY_US; + + pr_devel("pci %04d: Reset target: 0x%llx now: 0x%llx\n", + hose->global_number, target, now); + + if (now < target) + msleep((target - now + 999) / 1000); + + return PCI_PROBE_NORMAL; +} + +void __init pnv_pci_init(void) +{ + struct device_node *np; + + pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); + + /* OPAL absent, try POPAL first then RTAS detection of PHBs */ + if (!firmware_has_feature(FW_FEATURE_OPAL)) { +#ifdef CONFIG_PPC_POWERNV_RTAS + init_pci_config_tokens(); + find_and_init_phbs(); +#endif /* CONFIG_PPC_POWERNV_RTAS */ + } + /* OPAL is here, do our normal stuff */ + else { + int found_ioda = 0; + + /* Look for IODA IO-Hubs. We don't support mixing IODA + * and p5ioc2 due to the need to change some global + * probing flags + */ + for_each_compatible_node(np, NULL, "ibm,ioda-hub") { + pnv_pci_init_ioda_hub(np); + found_ioda = 1; + } + + /* Look for p5ioc2 IO-Hubs */ + if (!found_ioda) + for_each_compatible_node(np, NULL, "ibm,p5ioc2") + pnv_pci_init_p5ioc2_hub(np); + } + + /* Setup the linkage between OF nodes and PHBs */ + pci_devs_phb_init(); + + /* Configure IOMMU DMA hooks */ + ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup; + ppc_md.tce_build = pnv_tce_build; + ppc_md.tce_free = pnv_tce_free; + ppc_md.pci_probe_mode = pnv_pci_probe_mode; + set_pci_dma_ops(&dma_iommu_ops); + + /* Configure MSIs */ +#ifdef CONFIG_PCI_MSI + ppc_md.msi_check_device = pnv_msi_check_device; + ppc_md.setup_msi_irqs = pnv_setup_msi_irqs; + ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; +#endif +} diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h new file mode 100644 index 00000000..8bc47963 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci.h @@ -0,0 +1,148 @@ +#ifndef __POWERNV_PCI_H +#define __POWERNV_PCI_H + +struct pci_dn; + +enum pnv_phb_type { + PNV_PHB_P5IOC2, + PNV_PHB_IODA1, + PNV_PHB_IODA2, +}; + +/* Precise PHB model for error management */ +enum pnv_phb_model { + PNV_PHB_MODEL_UNKNOWN, + PNV_PHB_MODEL_P5IOC2, + PNV_PHB_MODEL_P7IOC, +}; + +#define PNV_PCI_DIAG_BUF_SIZE 4096 + +/* Data associated with a PE, including IOMMU tracking etc.. */ +struct pnv_ioda_pe { + /* A PE can be associated with a single device or an + * entire bus (& children). In the former case, pdev + * is populated, in the later case, pbus is. + */ + struct pci_dev *pdev; + struct pci_bus *pbus; + + /* Effective RID (device RID for a device PE and base bus + * RID with devfn 0 for a bus PE) + */ + unsigned int rid; + + /* PE number */ + unsigned int pe_number; + + /* "Weight" assigned to the PE for the sake of DMA resource + * allocations + */ + unsigned int dma_weight; + + /* This is a PCI-E -> PCI-X bridge, this points to the + * corresponding bus PE + */ + struct pnv_ioda_pe *bus_pe; + + /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ + int tce32_seg; + int tce32_segcount; + struct iommu_table tce32_table; + + /* XXX TODO: Add support for additional 64-bit iommus */ + + /* MSIs. MVE index is identical for for 32 and 64 bit MSI + * and -1 if not supported. (It's actually identical to the + * PE number) + */ + int mve_number; + + /* Link in list of PE#s */ + struct list_head link; +}; + +struct pnv_phb { + struct pci_controller *hose; + enum pnv_phb_type type; + enum pnv_phb_model model; + u64 opal_id; + void __iomem *regs; + spinlock_t lock; + +#ifdef CONFIG_PCI_MSI + unsigned long *msi_map; + unsigned int msi_base; + unsigned int msi_count; + unsigned int msi_next; + unsigned int msi32_support; +#endif + int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int is_64, + struct msi_msg *msg); + void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); + void (*fixup_phb)(struct pci_controller *hose); + u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); + + union { + struct { + struct iommu_table iommu_table; + } p5ioc2; + + struct { + /* Global bridge info */ + unsigned int total_pe; + unsigned int m32_size; + unsigned int m32_segsize; + unsigned int m32_pci_base; + unsigned int io_size; + unsigned int io_segsize; + unsigned int io_pci_base; + + /* PE allocation bitmap */ + unsigned long *pe_alloc; + + /* M32 & IO segment maps */ + unsigned int *m32_segmap; + unsigned int *io_segmap; + struct pnv_ioda_pe *pe_array; + + /* Reverse map of PEs, will have to extend if + * we are to support more than 256 PEs, indexed + * bus { bus, devfn } + */ + unsigned char pe_rmap[0x10000]; + + /* 32-bit TCE tables allocation */ + unsigned long tce32_count; + + /* Total "weight" for the sake of DMA resources + * allocation + */ + unsigned int dma_weight; + unsigned int dma_pe_count; + + /* Sorted list of used PE's, sorted at + * boot for resource allocation purposes + */ + struct list_head pe_list; + } ioda; + }; + + /* PHB status structure */ + union { + unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; + struct OpalIoP7IOCPhbErrorData p7ioc; + } diag; +}; + +extern struct pci_ops pnv_pci_ops; + +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset); +extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); +extern void pnv_pci_init_ioda_hub(struct device_node *np); + + +#endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h new file mode 100644 index 00000000..8a9df7f9 --- /dev/null +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -0,0 +1,16 @@ +#ifndef _POWERNV_H +#define _POWERNV_H + +#ifdef CONFIG_SMP +extern void pnv_smp_init(void); +#else +static inline void pnv_smp_init(void) { } +#endif + +#ifdef CONFIG_PCI +extern void pnv_pci_init(void); +#else +static inline void pnv_pci_init(void) { } +#endif + +#endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c new file mode 100644 index 00000000..db1ad1c8 --- /dev/null +++ b/arch/powerpc/platforms/powernv/setup.c @@ -0,0 +1,195 @@ +/* + * PowerNV setup code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/tty.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/interrupt.h> +#include <linux/bug.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/xics.h> +#include <asm/rtas.h> +#include <asm/opal.h> + +#include "powernv.h" + +static void __init pnv_setup_arch(void) +{ + /* Initialize SMP */ + pnv_smp_init(); + + /* Setup PCI */ + pnv_pci_init(); + + /* Setup RTC and NVRAM callbacks */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + opal_nvram_init(); + + /* Enable NAP mode */ + powersave_nap = 1; + + /* XXX PMCS */ +} + +static void __init pnv_init_early(void) +{ +#ifdef CONFIG_HVC_OPAL + if (firmware_has_feature(FW_FEATURE_OPAL)) + hvc_opal_init_early(); + else +#endif + add_preferred_console("hvc", 0, NULL); +} + +static void __init pnv_init_IRQ(void) +{ + xics_init(); + + WARN_ON(!ppc_md.get_irq); +} + +static void pnv_show_cpuinfo(struct seq_file *m) +{ + struct device_node *root; + const char *model = ""; + + root = of_find_node_by_path("/"); + if (root) + model = of_get_property(root, "model", NULL); + seq_printf(m, "machine\t\t: PowerNV %s\n", model); + if (firmware_has_feature(FW_FEATURE_OPALv2)) + seq_printf(m, "firmware\t: OPAL v2\n"); + else if (firmware_has_feature(FW_FEATURE_OPAL)) + seq_printf(m, "firmware\t: OPAL v1\n"); + else + seq_printf(m, "firmware\t: BML\n"); + of_node_put(root); +} + +static void __noreturn pnv_restart(char *cmd) +{ + long rc = OPAL_BUSY; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_cec_reboot(); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + for (;;) + opal_poll_events(NULL); +} + +static void __noreturn pnv_power_off(void) +{ + long rc = OPAL_BUSY; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_cec_power_down(0); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + for (;;) + opal_poll_events(NULL); +} + +static void __noreturn pnv_halt(void) +{ + pnv_power_off(); +} + +static void pnv_progress(char *s, unsigned short hex) +{ +} + +#ifdef CONFIG_KEXEC +static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) +{ + xics_kexec_teardown_cpu(secondary); +} +#endif /* CONFIG_KEXEC */ + +static void __init pnv_setup_machdep_opal(void) +{ + ppc_md.get_boot_time = opal_get_boot_time; + ppc_md.get_rtc_time = opal_get_rtc_time; + ppc_md.set_rtc_time = opal_set_rtc_time; + ppc_md.restart = pnv_restart; + ppc_md.power_off = pnv_power_off; + ppc_md.halt = pnv_halt; + ppc_md.machine_check_exception = opal_machine_check; +} + +#ifdef CONFIG_PPC_POWERNV_RTAS +static void __init pnv_setup_machdep_rtas(void) +{ + if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) { + ppc_md.get_boot_time = rtas_get_boot_time; + ppc_md.get_rtc_time = rtas_get_rtc_time; + ppc_md.set_rtc_time = rtas_set_rtc_time; + } + ppc_md.restart = rtas_restart; + ppc_md.power_off = rtas_power_off; + ppc_md.halt = rtas_halt; +} +#endif /* CONFIG_PPC_POWERNV_RTAS */ + +static int __init pnv_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (!of_flat_dt_is_compatible(root, "ibm,powernv")) + return 0; + + hpte_init_native(); + + if (firmware_has_feature(FW_FEATURE_OPAL)) + pnv_setup_machdep_opal(); +#ifdef CONFIG_PPC_POWERNV_RTAS + else if (rtas.base) + pnv_setup_machdep_rtas(); +#endif /* CONFIG_PPC_POWERNV_RTAS */ + + pr_debug("PowerNV detected !\n"); + + return 1; +} + +define_machine(powernv) { + .name = "PowerNV", + .probe = pnv_probe, + .init_early = pnv_init_early, + .setup_arch = pnv_setup_arch, + .init_IRQ = pnv_init_IRQ, + .show_cpuinfo = pnv_show_cpuinfo, + .progress = pnv_progress, + .power_save = power7_idle, + .calibrate_decr = generic_calibrate_decr, +#ifdef CONFIG_KEXEC + .kexec_cpu_down = pnv_kexec_cpu_down, +#endif +}; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c new file mode 100644 index 00000000..3ef46254 --- /dev/null +++ b/arch/powerpc/platforms/powernv/smp.c @@ -0,0 +1,181 @@ +/* + * SMP support for PowerNV machines. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> + +#include <asm/irq.h> +#include <asm/smp.h> +#include <asm/paca.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/vdso_datapage.h> +#include <asm/cputhreads.h> +#include <asm/xics.h> +#include <asm/opal.h> + +#include "powernv.h" + +#ifdef DEBUG +#include <asm/udbg.h> +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +static void __cpuinit pnv_smp_setup_cpu(int cpu) +{ + if (cpu != boot_cpuid) + xics_setup_cpu(); +} + +static int pnv_smp_cpu_bootable(unsigned int nr) +{ + /* Special case - we inhibit secondary thread startup + * during boot if the user requests it. + */ + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { + if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) + return 0; + if (smt_enabled_at_boot + && cpu_thread_in_core(nr) >= smt_enabled_at_boot) + return 0; + } + + return 1; +} + +int __devinit pnv_smp_kick_cpu(int nr) +{ + unsigned int pcpu = get_hard_smp_processor_id(nr); + unsigned long start_here = __pa(*((unsigned long *) + generic_secondary_smp_init)); + long rc; + + BUG_ON(nr < 0 || nr >= NR_CPUS); + + /* On OPAL v2 the CPU are still spinning inside OPAL itself, + * get them back now + */ + if (!paca[nr].cpu_start && firmware_has_feature(FW_FEATURE_OPALv2)) { + pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu); + rc = opal_start_cpu(pcpu, start_here); + if (rc != OPAL_SUCCESS) + pr_warn("OPAL Error %ld starting CPU %d\n", + rc, nr); + } + return smp_generic_kick_cpu(nr); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static int pnv_smp_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* This is identical to pSeries... might consolidate by + * moving migrate_irqs_away to a ppc_md with default to + * the generic fixup_irqs. --BenH. + */ + set_cpu_online(cpu, false); + vdso_data->processorCount--; + if (cpu == boot_cpuid) + boot_cpuid = cpumask_any(cpu_online_mask); + xics_migrate_irqs_away(); + return 0; +} + +static void pnv_smp_cpu_kill_self(void) +{ + unsigned int cpu; + + /* If powersave_nap is enabled, use NAP mode, else just + * spin aimlessly + */ + if (!powersave_nap) { + generic_mach_cpu_die(); + return; + } + + /* Standard hot unplug procedure */ + local_irq_disable(); + idle_task_exit(); + current->active_mm = NULL; /* for sanity */ + cpu = smp_processor_id(); + DBG("CPU%d offline\n", cpu); + generic_set_cpu_dead(cpu); + smp_wmb(); + + /* We don't want to take decrementer interrupts while we are offline, + * so clear LPCR:PECE1. We keep PECE2 enabled. + */ + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); + while (!generic_check_cpu_restart(cpu)) { + power7_idle(); + if (!generic_check_cpu_restart(cpu)) { + DBG("CPU%d Unexpected exit while offline !\n", cpu); + /* We may be getting an IPI, so we re-enable + * interrupts to process it, it will be ignored + * since we aren't online (hopefully) + */ + local_irq_enable(); + local_irq_disable(); + } + } + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); + DBG("CPU%d coming online...\n", cpu); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static struct smp_ops_t pnv_smp_ops = { + .message_pass = smp_muxed_ipi_message_pass, + .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ + .probe = xics_smp_probe, + .kick_cpu = pnv_smp_kick_cpu, + .setup_cpu = pnv_smp_setup_cpu, + .cpu_bootable = pnv_smp_cpu_bootable, +#ifdef CONFIG_HOTPLUG_CPU + .cpu_disable = pnv_smp_cpu_disable, + .cpu_die = generic_cpu_die, +#endif /* CONFIG_HOTPLUG_CPU */ +}; + +/* This is called very early during platform setup_arch */ +void __init pnv_smp_init(void) +{ + smp_ops = &pnv_smp_ops; + + /* XXX We don't yet have a proper entry point from HAL, for + * now we rely on kexec-style entry from BML + */ + +#ifdef CONFIG_PPC_RTAS + /* Non-lpar has additional take/give timebase */ + if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { + smp_ops->give_timebase = rtas_give_timebase; + smp_ops->take_timebase = rtas_take_timebase; + } +#endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_HOTPLUG_CPU + ppc_md.cpu_die = pnv_smp_cpu_kill_self; +#endif +} |