summaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig16
-rw-r--r--arch/powerpc/platforms/powernv/Makefile5
-rw-r--r--arch/powerpc/platforms/powernv/opal-nvram.c88
-rw-r--r--arch/powerpc/platforms/powernv/opal-rtc.c97
-rw-r--r--arch/powerpc/platforms/powernv/opal-takeover.S140
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S109
-rw-r--r--arch/powerpc/platforms/powernv/opal.c322
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c1340
-rw-r--r--arch/powerpc/platforms/powernv/pci-p5ioc2.c235
-rw-r--r--arch/powerpc/platforms/powernv/pci.c609
-rw-r--r--arch/powerpc/platforms/powernv/pci.h148
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h16
-rw-r--r--arch/powerpc/platforms/powernv/setup.c195
-rw-r--r--arch/powerpc/platforms/powernv/smp.c181
14 files changed, 3501 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
new file mode 100644
index 00000000..74fea5c2
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -0,0 +1,16 @@
+config PPC_POWERNV
+ depends on PPC64 && PPC_BOOK3S
+ bool "IBM PowerNV (Non-Virtualized) platform support"
+ select PPC_NATIVE
+ select PPC_XICS
+ select PPC_ICP_NATIVE
+ select PPC_P7_NAP
+ select PPC_PCI_CHOICE if EMBEDDED
+ default y
+
+config PPC_POWERNV_RTAS
+ depends on PPC_POWERNV
+ bool "Support for RTAS based PowerNV platforms such as BML"
+ default y
+ select PPC_ICS_RTAS
+ select PPC_RTAS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
new file mode 100644
index 00000000..bcc3cb48
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -0,0 +1,5 @@
+obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o
+obj-y += opal-rtc.o opal-nvram.o
+
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c
new file mode 100644
index 00000000..3f83e1ae
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -0,0 +1,88 @@
+/*
+ * PowerNV nvram code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+
+static ssize_t opal_nvram_size(void)
+{
+ return nvram_size;
+}
+
+static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index)
+{
+ s64 rc;
+ int off;
+
+ if (*index >= nvram_size)
+ return 0;
+ off = *index;
+ if ((off + count) > nvram_size)
+ count = nvram_size - off;
+ rc = opal_read_nvram(__pa(buf), count, off);
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+ *index += count;
+ return count;
+}
+
+static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
+{
+ s64 rc = OPAL_BUSY;
+ int off;
+
+ if (*index >= nvram_size)
+ return 0;
+ off = *index;
+ if ((off + count) > nvram_size)
+ count = nvram_size - off;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_write_nvram(__pa(buf), count, off);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ }
+ *index += count;
+ return count;
+}
+
+void __init opal_nvram_init(void)
+{
+ struct device_node *np;
+ const u32 *nbytes_p;
+
+ np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram");
+ if (np == NULL)
+ return;
+
+ nbytes_p = of_get_property(np, "#bytes", NULL);
+ if (!nbytes_p) {
+ of_node_put(np);
+ return;
+ }
+ nvram_size = *nbytes_p;
+
+ printk(KERN_INFO "OPAL nvram setup, %u bytes\n", nvram_size);
+ of_node_put(np);
+
+ ppc_md.nvram_read = opal_nvram_read;
+ ppc_md.nvram_write = opal_nvram_write;
+ ppc_md.nvram_size = opal_nvram_size;
+}
+
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
new file mode 100644
index 00000000..2aa7641a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -0,0 +1,97 @@
+/*
+ * PowerNV Real Time Clock.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+#include <asm/firmware.h>
+
+static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+{
+ tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) +
+ bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
+ tm->tm_mon = bcd2bin((y_m_d >> 8) & 0xff) - 1;
+ tm->tm_mday = bcd2bin(y_m_d & 0xff);
+ tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff);
+ tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff);
+ tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff);
+
+ GregorianDay(tm);
+}
+
+unsigned long __init opal_get_boot_time(void)
+{
+ struct rtc_time tm;
+ u32 y_m_d;
+ u64 h_m_s_ms;
+ long rc = OPAL_BUSY;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_read(&y_m_d, &h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ if (rc != OPAL_SUCCESS)
+ return 0;
+ opal_to_tm(y_m_d, h_m_s_ms, &tm);
+ return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
+void opal_get_rtc_time(struct rtc_time *tm)
+{
+ long rc = OPAL_BUSY;
+ u32 y_m_d;
+ u64 h_m_s_ms;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_read(&y_m_d, &h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ if (rc != OPAL_SUCCESS)
+ return;
+ opal_to_tm(y_m_d, h_m_s_ms, tm);
+}
+
+int opal_set_rtc_time(struct rtc_time *tm)
+{
+ long rc = OPAL_BUSY;
+ u32 y_m_d = 0;
+ u64 h_m_s_ms = 0;
+
+ y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) / 100)) << 24;
+ y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) % 100)) << 16;
+ y_m_d |= ((u32)bin2bcd((tm->tm_mon + 1))) << 8;
+ y_m_d |= ((u32)bin2bcd(tm->tm_mday));
+
+ h_m_s_ms |= ((u64)bin2bcd(tm->tm_hour)) << 56;
+ h_m_s_ms |= ((u64)bin2bcd(tm->tm_min)) << 48;
+ h_m_s_ms |= ((u64)bin2bcd(tm->tm_sec)) << 40;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_write(y_m_d, h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ return rc == OPAL_SUCCESS ? 0 : -EIO;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S
new file mode 100644
index 00000000..77b48b2b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-takeover.S
@@ -0,0 +1,140 @@
+/*
+ * PowerNV OPAL takeover assembly code, for use by prom_init.c
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/opal.h>
+
+#define STK_PARAM(i) (48 + ((i)-3)*8)
+
+#define H_HAL_TAKEOVER 0x5124
+#define H_HAL_TAKEOVER_QUERY_MAGIC -1
+
+ .text
+_GLOBAL(opal_query_takeover)
+ mfcr r0
+ stw r0,8(r1)
+ std r3,STK_PARAM(r3)(r1)
+ std r4,STK_PARAM(r4)(r1)
+ li r3,H_HAL_TAKEOVER
+ li r4,H_HAL_TAKEOVER_QUERY_MAGIC
+ HVSC
+ ld r10,STK_PARAM(r3)(r1)
+ std r4,0(r10)
+ ld r10,STK_PARAM(r4)(r1)
+ std r5,0(r10)
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+ blr
+
+_GLOBAL(opal_do_takeover)
+ mfcr r0
+ stw r0,8(r1)
+ mflr r0
+ std r0,16(r1)
+ bl __opal_do_takeover
+ ld r0,16(r1)
+ mtlr r0
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+ blr
+
+__opal_do_takeover:
+ ld r4,0(r3)
+ ld r5,0x8(r3)
+ ld r6,0x10(r3)
+ ld r7,0x18(r3)
+ ld r8,0x20(r3)
+ ld r9,0x28(r3)
+ ld r10,0x30(r3)
+ ld r11,0x38(r3)
+ li r3,H_HAL_TAKEOVER
+ HVSC
+ blr
+
+ .globl opal_secondary_entry
+opal_secondary_entry:
+ mr r31,r3
+ mfmsr r11
+ li r12,(MSR_SF | MSR_ISF)@highest
+ sldi r12,r12,48
+ or r11,r11,r12
+ mtmsrd r11
+ isync
+ mfspr r4,SPRN_PIR
+ std r4,0(r3)
+1: HMT_LOW
+ ld r4,8(r3)
+ cmpli cr0,r4,0
+ beq 1b
+ HMT_MEDIUM
+1: addi r3,r31,16
+ bl __opal_do_takeover
+ b 1b
+
+_GLOBAL(opal_enter_rtas)
+ mflr r0
+ std r0,16(r1)
+ stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */
+
+ /* Because PROM is running in 32b mode, it clobbers the high order half
+ * of all registers that it saves. We therefore save those registers
+ * PROM might touch to the stack. (r0, r3-r13 are caller saved)
+ */
+ SAVE_GPR(2, r1)
+ SAVE_GPR(13, r1)
+ SAVE_8GPRS(14, r1)
+ SAVE_10GPRS(22, r1)
+ mfcr r10
+ mfmsr r11
+ std r10,_CCR(r1)
+ std r11,_MSR(r1)
+
+ /* Get the PROM entrypoint */
+ mtlr r5
+
+ /* Switch MSR to 32 bits mode
+ */
+ li r12,1
+ rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG)
+ andc r11,r11,r12
+ li r12,1
+ rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG)
+ andc r11,r11,r12
+ mtmsrd r11
+ isync
+
+ /* Enter RTAS here... */
+ blrl
+
+ /* Just make sure that r1 top 32 bits didn't get
+ * corrupt by OF
+ */
+ rldicl r1,r1,0,32
+
+ /* Restore the MSR (back to 64 bits) */
+ ld r0,_MSR(r1)
+ MTMSRD(r0)
+ isync
+
+ /* Restore other registers */
+ REST_GPR(2, r1)
+ REST_GPR(13, r1)
+ REST_8GPRS(14, r1)
+ REST_10GPRS(22, r1)
+ ld r4,_CCR(r1)
+ mtcr r4
+
+ addi r1,r1,PROM_FRAME_SIZE
+ ld r0,16(r1)
+ mtlr r0
+ blr
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
new file mode 100644
index 00000000..3bb07e5e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -0,0 +1,109 @@
+/*
+ * PowerNV OPAL API wrappers
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/opal.h>
+
+/* TODO:
+ *
+ * - Trace irqs in/off (needs saving/restoring all args, argh...)
+ * - Get r11 feed up by Dave so I can have better register usage
+ */
+#define OPAL_CALL(name, token) \
+ _GLOBAL(name); \
+ mflr r0; \
+ mfcr r12; \
+ std r0,16(r1); \
+ std r12,8(r1); \
+ std r1,PACAR1(r13); \
+ li r0,0; \
+ mfmsr r12; \
+ ori r0,r0,MSR_EE; \
+ std r12,PACASAVEDMSR(r13); \
+ andc r12,r12,r0; \
+ mtmsrd r12,1; \
+ LOAD_REG_ADDR(r0,.opal_return); \
+ mtlr r0; \
+ li r0,MSR_DR|MSR_IR; \
+ andc r12,r12,r0; \
+ li r0,token; \
+ mtspr SPRN_HSRR1,r12; \
+ LOAD_REG_ADDR(r11,opal); \
+ ld r12,8(r11); \
+ ld r2,0(r11); \
+ mtspr SPRN_HSRR0,r12; \
+ hrfid
+
+_STATIC(opal_return)
+ ld r2,PACATOC(r13);
+ ld r4,8(r1);
+ ld r5,16(r1);
+ ld r6,PACASAVEDMSR(r13);
+ mtspr SPRN_SRR0,r5;
+ mtspr SPRN_SRR1,r6;
+ mtcr r4;
+ rfid
+
+OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
+OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
new file mode 100644
index 00000000..aaa0dba4
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -0,0 +1,322 @@
+/*
+ * PowerNV OPAL high level interfaces
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/interrupt.h>
+#include <asm/opal.h>
+#include <asm/firmware.h>
+
+#include "powernv.h"
+
+struct opal {
+ u64 base;
+ u64 entry;
+} opal;
+
+static struct device_node *opal_node;
+static DEFINE_SPINLOCK(opal_write_lock);
+extern u64 opal_mc_secondary_handler[];
+
+int __init early_init_dt_scan_opal(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ const void *basep, *entryp;
+ unsigned long basesz, entrysz;
+ u64 glue;
+
+ if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+ return 0;
+
+ basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
+ entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
+
+ if (!basep || !entryp)
+ return 1;
+
+ opal.base = of_read_number(basep, basesz/4);
+ opal.entry = of_read_number(entryp, entrysz/4);
+
+ pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n",
+ opal.base, basep, basesz);
+ pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n",
+ opal.entry, entryp, entrysz);
+
+ powerpc_firmware_features |= FW_FEATURE_OPAL;
+ if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) {
+ powerpc_firmware_features |= FW_FEATURE_OPALv2;
+ printk("OPAL V2 detected !\n");
+ } else {
+ printk("OPAL V1 detected !\n");
+ }
+
+ /* Hookup some exception handlers. We use the fwnmi area at 0x7000
+ * to provide the glue space to OPAL
+ */
+ glue = 0x7000;
+ opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER,
+ __pa(opal_mc_secondary_handler[0]),
+ glue);
+ glue += 128;
+ opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
+ 0, glue);
+ glue += 128;
+ opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
+
+ return 1;
+}
+
+int opal_get_chars(uint32_t vtermno, char *buf, int count)
+{
+ s64 len, rc;
+ u64 evt;
+
+ if (!opal.entry)
+ return -ENODEV;
+ opal_poll_events(&evt);
+ if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0)
+ return 0;
+ len = count;
+ rc = opal_console_read(vtermno, &len, buf);
+ if (rc == OPAL_SUCCESS)
+ return len;
+ return 0;
+}
+
+int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+{
+ int written = 0;
+ s64 len, rc;
+ unsigned long flags;
+ u64 evt;
+
+ if (!opal.entry)
+ return -ENODEV;
+
+ /* We want put_chars to be atomic to avoid mangling of hvsi
+ * packets. To do that, we first test for room and return
+ * -EAGAIN if there isn't enough.
+ *
+ * Unfortunately, opal_console_write_buffer_space() doesn't
+ * appear to work on opal v1, so we just assume there is
+ * enough room and be done with it
+ */
+ spin_lock_irqsave(&opal_write_lock, flags);
+ if (firmware_has_feature(FW_FEATURE_OPALv2)) {
+ rc = opal_console_write_buffer_space(vtermno, &len);
+ if (rc || len < total_len) {
+ spin_unlock_irqrestore(&opal_write_lock, flags);
+ /* Closed -> drop characters */
+ if (rc)
+ return total_len;
+ opal_poll_events(&evt);
+ return -EAGAIN;
+ }
+ }
+
+ /* We still try to handle partial completions, though they
+ * should no longer happen.
+ */
+ rc = OPAL_BUSY;
+ while(total_len > 0 && (rc == OPAL_BUSY ||
+ rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) {
+ len = total_len;
+ rc = opal_console_write(vtermno, &len, data);
+ if (rc == OPAL_SUCCESS) {
+ total_len -= len;
+ data += len;
+ written += len;
+ }
+ /* This is a bit nasty but we need that for the console to
+ * flush when there aren't any interrupts. We will clean
+ * things a bit later to limit that to synchronous path
+ * such as the kernel console and xmon/udbg
+ */
+ do
+ opal_poll_events(&evt);
+ while(rc == OPAL_SUCCESS && (evt & OPAL_EVENT_CONSOLE_OUTPUT));
+ }
+ spin_unlock_irqrestore(&opal_write_lock, flags);
+ return written;
+}
+
+int opal_machine_check(struct pt_regs *regs)
+{
+ struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt;
+ struct opal_machine_check_event evt;
+ const char *level, *sevstr, *subtype;
+ static const char *opal_mc_ue_types[] = {
+ "Indeterminate",
+ "Instruction fetch",
+ "Page table walk ifetch",
+ "Load/Store",
+ "Page table walk Load/Store",
+ };
+ static const char *opal_mc_slb_types[] = {
+ "Indeterminate",
+ "Parity",
+ "Multihit",
+ };
+ static const char *opal_mc_erat_types[] = {
+ "Indeterminate",
+ "Parity",
+ "Multihit",
+ };
+ static const char *opal_mc_tlb_types[] = {
+ "Indeterminate",
+ "Parity",
+ "Multihit",
+ };
+
+ /* Copy the event structure and release the original */
+ evt = *opal_evt;
+ opal_evt->in_use = 0;
+
+ /* Print things out */
+ if (evt.version != OpalMCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt.version);
+ return 0;
+ }
+ switch(evt.severity) {
+ case OpalMCE_SEV_NO_ERROR:
+ level = KERN_INFO;
+ sevstr = "Harmless";
+ break;
+ case OpalMCE_SEV_WARNING:
+ level = KERN_WARNING;
+ sevstr = "";
+ break;
+ case OpalMCE_SEV_ERROR_SYNC:
+ level = KERN_ERR;
+ sevstr = "Severe";
+ break;
+ case OpalMCE_SEV_FATAL:
+ default:
+ level = KERN_ERR;
+ sevstr = "Fatal";
+ break;
+ }
+
+ printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+ evt.disposition == OpalMCE_DISPOSITION_RECOVERED ?
+ "Recovered" : "[Not recovered");
+ printk("%s Initiator: %s\n", level,
+ evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown");
+ switch(evt.error_type) {
+ case OpalMCE_ERROR_TYPE_UE:
+ subtype = evt.u.ue_error.ue_error_type <
+ ARRAY_SIZE(opal_mc_ue_types) ?
+ opal_mc_ue_types[evt.u.ue_error.ue_error_type]
+ : "Unknown";
+ printk("%s Error type: UE [%s]\n", level, subtype);
+ if (evt.u.ue_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt.u.ue_error.effective_address);
+ if (evt.u.ue_error.physical_address_provided)
+ printk("%s Physial address: %016llx\n",
+ level, evt.u.ue_error.physical_address);
+ break;
+ case OpalMCE_ERROR_TYPE_SLB:
+ subtype = evt.u.slb_error.slb_error_type <
+ ARRAY_SIZE(opal_mc_slb_types) ?
+ opal_mc_slb_types[evt.u.slb_error.slb_error_type]
+ : "Unknown";
+ printk("%s Error type: SLB [%s]\n", level, subtype);
+ if (evt.u.slb_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt.u.slb_error.effective_address);
+ break;
+ case OpalMCE_ERROR_TYPE_ERAT:
+ subtype = evt.u.erat_error.erat_error_type <
+ ARRAY_SIZE(opal_mc_erat_types) ?
+ opal_mc_erat_types[evt.u.erat_error.erat_error_type]
+ : "Unknown";
+ printk("%s Error type: ERAT [%s]\n", level, subtype);
+ if (evt.u.erat_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt.u.erat_error.effective_address);
+ break;
+ case OpalMCE_ERROR_TYPE_TLB:
+ subtype = evt.u.tlb_error.tlb_error_type <
+ ARRAY_SIZE(opal_mc_tlb_types) ?
+ opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type]
+ : "Unknown";
+ printk("%s Error type: TLB [%s]\n", level, subtype);
+ if (evt.u.tlb_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt.u.tlb_error.effective_address);
+ break;
+ default:
+ case OpalMCE_ERROR_TYPE_UNKNOWN:
+ printk("%s Error type: Unknown\n", level);
+ break;
+ }
+ return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1;
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+ uint64_t events;
+
+ opal_handle_interrupt(virq_to_hw(irq), &events);
+
+ /* XXX TODO: Do something with the events */
+
+ return IRQ_HANDLED;
+}
+
+static int __init opal_init(void)
+{
+ struct device_node *np, *consoles;
+ const u32 *irqs;
+ int rc, i, irqlen;
+
+ opal_node = of_find_node_by_path("/ibm,opal");
+ if (!opal_node) {
+ pr_warn("opal: Node not found\n");
+ return -ENODEV;
+ }
+ if (firmware_has_feature(FW_FEATURE_OPALv2))
+ consoles = of_find_node_by_path("/ibm,opal/consoles");
+ else
+ consoles = of_node_get(opal_node);
+
+ /* Register serial ports */
+ for_each_child_of_node(consoles, np) {
+ if (strcmp(np->name, "serial"))
+ continue;
+ of_platform_device_create(np, NULL, NULL);
+ }
+ of_node_put(consoles);
+
+ /* Find all OPAL interrupts and request them */
+ irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
+ pr_debug("opal: Found %d interrupts reserved for OPAL\n",
+ irqs ? (irqlen / 4) : 0);
+ for (i = 0; irqs && i < (irqlen / 4); i++, irqs++) {
+ unsigned int hwirq = be32_to_cpup(irqs);
+ unsigned int irq = irq_create_mapping(NULL, hwirq);
+ if (irq == NO_IRQ) {
+ pr_warning("opal: Failed to map irq 0x%x\n", hwirq);
+ continue;
+ }
+ rc = request_irq(irq, opal_interrupt, 0, "opal", NULL);
+ if (rc)
+ pr_warning("opal: Error %d requesting irq %d"
+ " (0x%x)\n", rc, irq, hwirq);
+ }
+ return 0;
+}
+subsys_initcall(opal_init);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 00000000..fbdd74da
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,1340 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/abs_addr.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+struct resource_wrap {
+ struct list_head link;
+ resource_size_t size;
+ resource_size_t align;
+ struct pci_dev *dev; /* Set if it's a device */
+ struct pci_bus *bus; /* Set if it's a bridge */
+};
+
+static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe,
+ struct va_format *vaf)
+{
+ char pfix[32];
+
+ if (pe->pdev)
+ strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
+ else
+ sprintf(pfix, "%04x:%02x ",
+ pci_domain_nr(pe->pbus), pe->pbus->number);
+ return printk("pci %s%s: [PE# %.3d] %pV", level, pfix, pe->pe_number, vaf);
+}
+
+#define define_pe_printk_level(func, kern_level) \
+static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ int r; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ r = __pe_printk(kern_level, pe, &vaf); \
+ va_end(args); \
+ \
+ return r; \
+} \
+
+define_pe_printk_level(pe_err, KERN_ERR);
+define_pe_printk_level(pe_warn, KERN_WARNING);
+define_pe_printk_level(pe_info, KERN_INFO);
+
+
+/* Calculate resource usage & alignment requirement of a single
+ * device. This will also assign all resources within the device
+ * for a given type starting at 0 for the biggest one and then
+ * assigning in decreasing order of size.
+ */
+static void __devinit pnv_ioda_calc_dev(struct pci_dev *dev, unsigned int flags,
+ resource_size_t *size,
+ resource_size_t *align)
+{
+ resource_size_t start;
+ struct resource *r;
+ int i;
+
+ pr_devel(" -> CDR %s\n", pci_name(dev));
+
+ *size = *align = 0;
+
+ /* Clear the resources out and mark them all unset */
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+ r = &dev->resource[i];
+ if (!(r->flags & flags))
+ continue;
+ if (r->start) {
+ r->end -= r->start;
+ r->start = 0;
+ }
+ r->flags |= IORESOURCE_UNSET;
+ }
+
+ /* We currently keep all memory resources together, we
+ * will handle prefetch & 64-bit separately in the future
+ * but for now we stick everybody in M32
+ */
+ start = 0;
+ for (;;) {
+ resource_size_t max_size = 0;
+ int max_no = -1;
+
+ /* Find next biggest resource */
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+ r = &dev->resource[i];
+ if (!(r->flags & IORESOURCE_UNSET) ||
+ !(r->flags & flags))
+ continue;
+ if (resource_size(r) > max_size) {
+ max_size = resource_size(r);
+ max_no = i;
+ }
+ }
+ if (max_no < 0)
+ break;
+ r = &dev->resource[max_no];
+ if (max_size > *align)
+ *align = max_size;
+ *size += max_size;
+ r->start = start;
+ start += max_size;
+ r->end = r->start + max_size - 1;
+ r->flags &= ~IORESOURCE_UNSET;
+ pr_devel(" -> R%d %016llx..%016llx\n",
+ max_no, r->start, r->end);
+ }
+ pr_devel(" <- CDR %s size=%llx align=%llx\n",
+ pci_name(dev), *size, *align);
+}
+
+/* Allocate a resource "wrap" for a given device or bridge and
+ * insert it at the right position in the sorted list
+ */
+static void __devinit pnv_ioda_add_wrap(struct list_head *list,
+ struct pci_bus *bus,
+ struct pci_dev *dev,
+ resource_size_t size,
+ resource_size_t align)
+{
+ struct resource_wrap *w1, *w = kzalloc(sizeof(*w), GFP_KERNEL);
+
+ w->size = size;
+ w->align = align;
+ w->dev = dev;
+ w->bus = bus;
+
+ list_for_each_entry(w1, list, link) {
+ if (w1->align < align) {
+ list_add_tail(&w->link, &w1->link);
+ return;
+ }
+ }
+ list_add_tail(&w->link, list);
+}
+
+/* Offset device resources of a given type */
+static void __devinit pnv_ioda_offset_dev(struct pci_dev *dev,
+ unsigned int flags,
+ resource_size_t offset)
+{
+ struct resource *r;
+ int i;
+
+ pr_devel(" -> ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset);
+
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+ r = &dev->resource[i];
+ if (r->flags & flags) {
+ dev->resource[i].start += offset;
+ dev->resource[i].end += offset;
+ }
+ }
+
+ pr_devel(" <- ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset);
+}
+
+/* Offset bus resources (& all children) of a given type */
+static void __devinit pnv_ioda_offset_bus(struct pci_bus *bus,
+ unsigned int flags,
+ resource_size_t offset)
+{
+ struct resource *r;
+ struct pci_dev *dev;
+ struct pci_bus *cbus;
+ int i;
+
+ pr_devel(" -> OBR %s [%x] +%016llx\n",
+ bus->self ? pci_name(bus->self) : "root", flags, offset);
+
+ pci_bus_for_each_resource(bus, r, i) {
+ if (r && (r->flags & flags)) {
+ r->start += offset;
+ r->end += offset;
+ }
+ }
+ list_for_each_entry(dev, &bus->devices, bus_list)
+ pnv_ioda_offset_dev(dev, flags, offset);
+ list_for_each_entry(cbus, &bus->children, node)
+ pnv_ioda_offset_bus(cbus, flags, offset);
+
+ pr_devel(" <- OBR %s [%x]\n",
+ bus->self ? pci_name(bus->self) : "root", flags);
+}
+
+/* This is the guts of our IODA resource allocation. This is called
+ * recursively for each bus in the system. It calculates all the
+ * necessary size and requirements for children and assign them
+ * resources such that:
+ *
+ * - Each function fits in it's own contiguous set of IO/M32
+ * segment
+ *
+ * - All segments behind a P2P bridge are contiguous and obey
+ * alignment constraints of those bridges
+ */
+static void __devinit pnv_ioda_calc_bus(struct pci_bus *bus, unsigned int flags,
+ resource_size_t *size,
+ resource_size_t *align)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ resource_size_t dev_size, dev_align, start;
+ resource_size_t min_align, min_balign;
+ struct pci_dev *cdev;
+ struct pci_bus *cbus;
+ struct list_head head;
+ struct resource_wrap *w;
+ unsigned int bres;
+
+ *size = *align = 0;
+
+ pr_devel("-> CBR %s [%x]\n",
+ bus->self ? pci_name(bus->self) : "root", flags);
+
+ /* Calculate alignment requirements based on the type
+ * of resource we are working on
+ */
+ if (flags & IORESOURCE_IO) {
+ bres = 0;
+ min_align = phb->ioda.io_segsize;
+ min_balign = 0x1000;
+ } else {
+ bres = 1;
+ min_align = phb->ioda.m32_segsize;
+ min_balign = 0x100000;
+ }
+
+ /* Gather all our children resources ordered by alignment */
+ INIT_LIST_HEAD(&head);
+
+ /* - Busses */
+ list_for_each_entry(cbus, &bus->children, node) {
+ pnv_ioda_calc_bus(cbus, flags, &dev_size, &dev_align);
+ pnv_ioda_add_wrap(&head, cbus, NULL, dev_size, dev_align);
+ }
+
+ /* - Devices */
+ list_for_each_entry(cdev, &bus->devices, bus_list) {
+ pnv_ioda_calc_dev(cdev, flags, &dev_size, &dev_align);
+ /* Align them to segment size */
+ if (dev_align < min_align)
+ dev_align = min_align;
+ pnv_ioda_add_wrap(&head, NULL, cdev, dev_size, dev_align);
+ }
+ if (list_empty(&head))
+ goto empty;
+
+ /* Now we can do two things: assign offsets to them within that
+ * level and get our total alignment & size requirements. The
+ * assignment algorithm is going to be uber-trivial for now, we
+ * can try to be smarter later at filling out holes.
+ */
+ if (bus->self) {
+ /* No offset for downstream bridges */
+ start = 0;
+ } else {
+ /* Offset from the root */
+ if (flags & IORESOURCE_IO)
+ /* Don't hand out IO 0 */
+ start = hose->io_resource.start + 0x1000;
+ else
+ start = hose->mem_resources[0].start;
+ }
+ while(!list_empty(&head)) {
+ w = list_first_entry(&head, struct resource_wrap, link);
+ list_del(&w->link);
+ if (w->size) {
+ if (start) {
+ start = ALIGN(start, w->align);
+ if (w->dev)
+ pnv_ioda_offset_dev(w->dev,flags,start);
+ else if (w->bus)
+ pnv_ioda_offset_bus(w->bus,flags,start);
+ }
+ if (w->align > *align)
+ *align = w->align;
+ }
+ start += w->size;
+ kfree(w);
+ }
+ *size = start;
+
+ /* Align and setup bridge resources */
+ *align = max_t(resource_size_t, *align,
+ max_t(resource_size_t, min_align, min_balign));
+ *size = ALIGN(*size,
+ max_t(resource_size_t, min_align, min_balign));
+ empty:
+ /* Only setup P2P's, not the PHB itself */
+ if (bus->self) {
+ struct resource *res = bus->resource[bres];
+
+ if (WARN_ON(res == NULL))
+ return;
+
+ /*
+ * FIXME: We should probably export and call
+ * pci_bridge_check_ranges() to properly re-initialize
+ * the PCI portion of the flags here, and to detect
+ * what the bridge actually supports.
+ */
+ res->start = 0;
+ res->flags = (*size) ? flags : 0;
+ res->end = (*size) ? (*size - 1) : 0;
+ }
+
+ pr_devel("<- CBR %s [%x] *size=%016llx *align=%016llx\n",
+ bus->self ? pci_name(bus->self) : "root", flags,*size,*align);
+}
+
+static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev)
+{
+ struct device_node *np;
+
+ np = pci_device_to_OF_node(dev);
+ if (!np)
+ return NULL;
+ return PCI_DN(np);
+}
+
+static void __devinit pnv_ioda_setup_pe_segments(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+ unsigned int pe, i;
+ resource_size_t pos;
+ struct resource io_res;
+ struct resource m32_res;
+ struct pci_bus_region region;
+ int rc;
+
+ /* Anything not referenced in the device-tree gets PE#0 */
+ pe = pdn ? pdn->pe_number : 0;
+
+ /* Calculate the device min/max */
+ io_res.start = m32_res.start = (resource_size_t)-1;
+ io_res.end = m32_res.end = 0;
+ io_res.flags = IORESOURCE_IO;
+ m32_res.flags = IORESOURCE_MEM;
+
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+ struct resource *r = NULL;
+ if (dev->resource[i].flags & IORESOURCE_IO)
+ r = &io_res;
+ if (dev->resource[i].flags & IORESOURCE_MEM)
+ r = &m32_res;
+ if (!r)
+ continue;
+ if (dev->resource[i].start < r->start)
+ r->start = dev->resource[i].start;
+ if (dev->resource[i].end > r->end)
+ r->end = dev->resource[i].end;
+ }
+
+ /* Setup IO segments */
+ if (io_res.start < io_res.end) {
+ pcibios_resource_to_bus(dev, &region, &io_res);
+ pos = region.start;
+ i = pos / phb->ioda.io_segsize;
+ while(i < phb->ioda.total_pe && pos <= region.end) {
+ if (phb->ioda.io_segmap[i]) {
+ pr_err("%s: Trying to use IO seg #%d which is"
+ " already used by PE# %d\n",
+ pci_name(dev), i,
+ phb->ioda.io_segmap[i]);
+ /* XXX DO SOMETHING TO DISABLE DEVICE ? */
+ break;
+ }
+ phb->ioda.io_segmap[i] = pe;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe,
+ OPAL_IO_WINDOW_TYPE,
+ 0, i);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: OPAL error %d setting up mapping"
+ " for IO seg# %d\n",
+ pci_name(dev), rc, i);
+ /* XXX DO SOMETHING TO DISABLE DEVICE ? */
+ break;
+ }
+ pos += phb->ioda.io_segsize;
+ i++;
+ };
+ }
+
+ /* Setup M32 segments */
+ if (m32_res.start < m32_res.end) {
+ pcibios_resource_to_bus(dev, &region, &m32_res);
+ pos = region.start;
+ i = pos / phb->ioda.m32_segsize;
+ while(i < phb->ioda.total_pe && pos <= region.end) {
+ if (phb->ioda.m32_segmap[i]) {
+ pr_err("%s: Trying to use M32 seg #%d which is"
+ " already used by PE# %d\n",
+ pci_name(dev), i,
+ phb->ioda.m32_segmap[i]);
+ /* XXX DO SOMETHING TO DISABLE DEVICE ? */
+ break;
+ }
+ phb->ioda.m32_segmap[i] = pe;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe,
+ OPAL_M32_WINDOW_TYPE,
+ 0, i);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: OPAL error %d setting up mapping"
+ " for M32 seg# %d\n",
+ pci_name(dev), rc, i);
+ /* XXX DO SOMETHING TO DISABLE DEVICE ? */
+ break;
+ }
+ pos += phb->ioda.m32_segsize;
+ i++;
+ }
+ }
+}
+
+/* Check if a resource still fits in the total IO or M32 range
+ * for a given PHB
+ */
+static int __devinit pnv_ioda_resource_fit(struct pci_controller *hose,
+ struct resource *r)
+{
+ struct resource *bounds;
+
+ if (r->flags & IORESOURCE_IO)
+ bounds = &hose->io_resource;
+ else if (r->flags & IORESOURCE_MEM)
+ bounds = &hose->mem_resources[0];
+ else
+ return 1;
+
+ if (r->start >= bounds->start && r->end <= bounds->end)
+ return 1;
+ r->flags = 0;
+ return 0;
+}
+
+static void __devinit pnv_ioda_update_resources(struct pci_bus *bus)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pci_bus *cbus;
+ struct pci_dev *cdev;
+ unsigned int i;
+
+ /* We used to clear all device enables here. However it looks like
+ * clearing MEM enable causes Obsidian (IPR SCS) to go bonkers,
+ * and shoot fatal errors to the PHB which in turns fences itself
+ * and we can't recover from that ... yet. So for now, let's leave
+ * the enables as-is and hope for the best.
+ */
+
+ /* Check if bus resources fit in our IO or M32 range */
+ for (i = 0; bus->self && (i < 2); i++) {
+ struct resource *r = bus->resource[i];
+ if (r && !pnv_ioda_resource_fit(hose, r))
+ pr_err("%s: Bus %d resource %d disabled, no room\n",
+ pci_name(bus->self), bus->number, i);
+ }
+
+ /* Update self if it's not a PHB */
+ if (bus->self)
+ pci_setup_bridge(bus);
+
+ /* Update child devices */
+ list_for_each_entry(cdev, &bus->devices, bus_list) {
+ /* Check if resource fits, if not, disabled it */
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+ struct resource *r = &cdev->resource[i];
+ if (!pnv_ioda_resource_fit(hose, r))
+ pr_err("%s: Resource %d disabled, no room\n",
+ pci_name(cdev), i);
+ }
+
+ /* Assign segments */
+ pnv_ioda_setup_pe_segments(cdev);
+
+ /* Update HW BARs */
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++)
+ pci_update_resource(cdev, i);
+ }
+
+ /* Update child busses */
+ list_for_each_entry(cbus, &bus->children, node)
+ pnv_ioda_update_resources(cbus);
+}
+
+static int __devinit pnv_ioda_alloc_pe(struct pnv_phb *phb)
+{
+ unsigned long pe;
+
+ do {
+ pe = find_next_zero_bit(phb->ioda.pe_alloc,
+ phb->ioda.total_pe, 0);
+ if (pe >= phb->ioda.total_pe)
+ return IODA_INVALID_PE;
+ } while(test_and_set_bit(pe, phb->ioda.pe_alloc));
+
+ phb->ioda.pe_array[pe].pe_number = pe;
+ return pe;
+}
+
+static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+{
+ WARN_ON(phb->ioda.pe_array[pe].pdev);
+
+ memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
+ clear_bit(pe, phb->ioda.pe_alloc);
+}
+
+/* Currently those 2 are only used when MSIs are enabled, this will change
+ * but in the meantime, we need to protect them to avoid warnings
+ */
+#ifdef CONFIG_PCI_MSI
+static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+
+ if (!pdn)
+ return NULL;
+ if (pdn->pe_number == IODA_INVALID_PE)
+ return NULL;
+ return &phb->ioda.pe_array[pdn->pe_number];
+}
+
+static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
+{
+ struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev);
+
+ while (!pe && dev->bus->self) {
+ dev = dev->bus->self;
+ pe = __pnv_ioda_get_one_pe(dev);
+ if (pe)
+ pe = pe->bus_pe;
+ }
+ return pe;
+}
+#endif /* CONFIG_PCI_MSI */
+
+static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *parent;
+ uint8_t bcomp, dcomp, fcomp;
+ long rc, rid_end, rid;
+
+ /* Bus validation ? */
+ if (pe->pbus) {
+ int count;
+
+ dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+ parent = pe->pbus->self;
+ count = pe->pbus->subordinate - pe->pbus->secondary + 1;
+ switch(count) {
+ case 1: bcomp = OpalPciBusAll; break;
+ case 2: bcomp = OpalPciBus7Bits; break;
+ case 4: bcomp = OpalPciBus6Bits; break;
+ case 8: bcomp = OpalPciBus5Bits; break;
+ case 16: bcomp = OpalPciBus4Bits; break;
+ case 32: bcomp = OpalPciBus3Bits; break;
+ default:
+ pr_err("%s: Number of subordinate busses %d"
+ " unsupported\n",
+ pci_name(pe->pbus->self), count);
+ /* Do an exact match only */
+ bcomp = OpalPciBusAll;
+ }
+ rid_end = pe->rid + (count << 8);
+ } else {
+ parent = pe->pdev->bus->self;
+ bcomp = OpalPciBusAll;
+ dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+ rid_end = pe->rid + 1;
+ }
+
+ /* Associate PE in PELT */
+ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+ bcomp, dcomp, fcomp, OPAL_MAP_PE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+ return -ENXIO;
+ }
+ opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Add to all parents PELT-V */
+ while (parent) {
+ struct pci_dn *pdn = pnv_ioda_get_pdn(parent);
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number, OPAL_ADD_PE_TO_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+ /* Setup reverse map */
+ for (rid = pe->rid; rid < rid_end; rid++)
+ phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+ /* Setup one MVTs on IODA1 */
+ if (phb->type == PNV_PHB_IODA1) {
+ pe->mve_number = pe->pe_number;
+ rc = opal_pci_set_mve(phb->opal_id, pe->mve_number,
+ pe->pe_number);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld setting up MVE %d\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ } else {
+ rc = opal_pci_set_mve_enable(phb->opal_id,
+ pe->mve_number, OPAL_ENABLE_MVE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld enabling MVE %d\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ }
+ }
+ } else if (phb->type == PNV_PHB_IODA2)
+ pe->mve_number = 0;
+
+ return 0;
+}
+
+static void __devinit pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+ struct pnv_ioda_pe *lpe;
+
+ list_for_each_entry(lpe, &phb->ioda.pe_list, link) {
+ if (lpe->dma_weight < pe->dma_weight) {
+ list_add_tail(&pe->link, &lpe->link);
+ return;
+ }
+ }
+ list_add_tail(&pe->link, &phb->ioda.pe_list);
+}
+
+static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
+{
+ /* This is quite simplistic. The "base" weight of a device
+ * is 10. 0 means no DMA is to be accounted for it.
+ */
+
+ /* If it's a bridge, no DMA */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return 0;
+
+ /* Reduce the weight of slow USB controllers */
+ if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+ return 3;
+
+ /* Increase the weight of RAID (includes Obsidian) */
+ if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+ return 15;
+
+ /* Default */
+ return 10;
+}
+
+static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+
+ if (!pdn) {
+ pr_err("%s: Device tree node not associated properly\n",
+ pci_name(dev));
+ return NULL;
+ }
+ if (pdn->pe_number != IODA_INVALID_PE)
+ return NULL;
+
+ /* PE#0 has been pre-set */
+ if (dev->bus->number == 0)
+ pe_num = 0;
+ else
+ pe_num = pnv_ioda_alloc_pe(phb);
+ if (pe_num == IODA_INVALID_PE) {
+ pr_warning("%s: Not enough PE# available, disabling device\n",
+ pci_name(dev));
+ return NULL;
+ }
+
+ /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
+ * pointer in the PE data structure, both should be destroyed at the
+ * same time. However, this needs to be looked at more closely again
+ * once we actually start removing things (Hotplug, SR-IOV, ...)
+ *
+ * At some point we want to remove the PDN completely anyways
+ */
+ pe = &phb->ioda.pe_array[pe_num];
+ pci_dev_get(dev);
+ pdn->pcidev = dev;
+ pdn->pe_number = pe_num;
+ pe->pdev = dev;
+ pe->pbus = NULL;
+ pe->tce32_seg = -1;
+ pe->mve_number = -1;
+ pe->rid = dev->bus->number << 8 | pdn->devfn;
+
+ pe_info(pe, "Associated device to PE\n");
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ if (pe_num)
+ pnv_ioda_free_pe(phb, pe_num);
+ pdn->pe_number = IODA_INVALID_PE;
+ pe->pdev = NULL;
+ pci_dev_put(dev);
+ return NULL;
+ }
+
+ /* Assign a DMA weight to the device */
+ pe->dma_weight = pnv_ioda_dma_weight(dev);
+ if (pe->dma_weight != 0) {
+ phb->ioda.dma_weight += pe->dma_weight;
+ phb->ioda.dma_pe_count++;
+ }
+
+ /* Link the PE */
+ pnv_ioda_link_pe_by_weight(phb, pe);
+
+ return pe;
+}
+
+static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+
+ if (pdn == NULL) {
+ pr_warn("%s: No device node associated with device !\n",
+ pci_name(dev));
+ continue;
+ }
+ pci_dev_get(dev);
+ pdn->pcidev = dev;
+ pdn->pe_number = pe->pe_number;
+ pe->dma_weight += pnv_ioda_dma_weight(dev);
+ if (dev->subordinate)
+ pnv_ioda_setup_same_PE(dev->subordinate, pe);
+ }
+}
+
+static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
+ struct pnv_ioda_pe *ppe)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_bus *bus = dev->subordinate;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+
+ if (!bus) {
+ pr_warning("%s: Bridge without a subordinate bus !\n",
+ pci_name(dev));
+ return;
+ }
+ pe_num = pnv_ioda_alloc_pe(phb);
+ if (pe_num == IODA_INVALID_PE) {
+ pr_warning("%s: Not enough PE# available, disabling bus\n",
+ pci_name(dev));
+ return;
+ }
+
+ pe = &phb->ioda.pe_array[pe_num];
+ ppe->bus_pe = pe;
+ pe->pbus = bus;
+ pe->pdev = NULL;
+ pe->tce32_seg = -1;
+ pe->mve_number = -1;
+ pe->rid = bus->secondary << 8;
+ pe->dma_weight = 0;
+
+ pe_info(pe, "Secondary busses %d..%d associated with PE\n",
+ bus->secondary, bus->subordinate);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ if (pe_num)
+ pnv_ioda_free_pe(phb, pe_num);
+ pe->pbus = NULL;
+ return;
+ }
+
+ /* Associate it with all child devices */
+ pnv_ioda_setup_same_PE(bus, pe);
+
+ /* Account for one DMA PE if at least one DMA capable device exist
+ * below the bridge
+ */
+ if (pe->dma_weight != 0) {
+ phb->ioda.dma_weight += pe->dma_weight;
+ phb->ioda.dma_pe_count++;
+ }
+
+ /* Link the PE */
+ pnv_ioda_link_pe_by_weight(phb, pe);
+}
+
+static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+ struct pnv_ioda_pe *pe;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ pe = pnv_ioda_setup_dev_PE(dev);
+ if (pe == NULL)
+ continue;
+ /* Leaving the PCIe domain ... single PE# */
+ if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+ pnv_ioda_setup_bus_PE(dev, pe);
+ else if (dev->subordinate)
+ pnv_ioda_setup_PEs(dev->subordinate);
+ }
+}
+
+static void __devinit pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb,
+ struct pci_dev *dev)
+{
+ /* We delay DMA setup after we have assigned all PE# */
+}
+
+static void __devinit pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
+ struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ set_iommu_table_base(&dev->dev, &pe->tce32_table);
+ if (dev->subordinate)
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+ }
+}
+
+static void __devinit pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe,
+ unsigned int base,
+ unsigned int segs)
+{
+
+ struct page *tce_mem = NULL;
+ const __be64 *swinvp;
+ struct iommu_table *tbl;
+ unsigned int i;
+ int64_t rc;
+ void *addr;
+
+ /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+
+ /* XXX FIXME: Handle 64-bit only DMA devices */
+ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
+ /* XXX FIXME: Allocate multi-level tables on PHB3 */
+
+ /* We shouldn't already have a 32-bit DMA associated */
+ if (WARN_ON(pe->tce32_seg >= 0))
+ return;
+
+ /* Grab a 32-bit TCE table */
+ pe->tce32_seg = base;
+ pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
+ (base << 28), ((base + segs) << 28) - 1);
+
+ /* XXX Currently, we allocate one big contiguous table for the
+ * TCEs. We only really need one chunk per 256M of TCE space
+ * (ie per segment) but that's an optimization for later, it
+ * requires some added smarts with our get/put_tce implementation
+ */
+ tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+ get_order(TCE32_TABLE_SIZE * segs));
+ if (!tce_mem) {
+ pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
+ goto fail;
+ }
+ addr = page_address(tce_mem);
+ memset(addr, 0, TCE32_TABLE_SIZE * segs);
+
+ /* Configure HW */
+ for (i = 0; i < segs; i++) {
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ pe->pe_number,
+ base + i, 1,
+ __pa(addr) + TCE32_TABLE_SIZE * i,
+ TCE32_TABLE_SIZE, 0x1000);
+ if (rc) {
+ pe_err(pe, " Failed to configure 32-bit TCE table,"
+ " err %ld\n", rc);
+ goto fail;
+ }
+ }
+
+ /* Setup linux iommu table */
+ tbl = &pe->tce32_table;
+ pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
+ base << 28);
+
+ /* OPAL variant of P7IOC SW invalidated TCEs */
+ swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+ if (swinvp) {
+ /* We need a couple more fields -- an address and a data
+ * to or. Since the bus is only printed out on table free
+ * errors, and on the first pass the data will be a relative
+ * bus number, print that out instead.
+ */
+ tbl->it_busno = 0;
+ tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8);
+ tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE
+ | TCE_PCI_SWINV_PAIR;
+ }
+ iommu_init_table(tbl, phb->hose->node);
+
+ if (pe->pdev)
+ set_iommu_table_base(&pe->pdev->dev, tbl);
+ else
+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
+ return;
+ fail:
+ /* XXX Failure: Try to fallback to 64-bit only ? */
+ if (pe->tce32_seg >= 0)
+ pe->tce32_seg = -1;
+ if (tce_mem)
+ __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+}
+
+static void __devinit pnv_ioda_setup_dma(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ unsigned int residual, remaining, segs, tw, base;
+ struct pnv_ioda_pe *pe;
+
+ /* If we have more PE# than segments available, hand out one
+ * per PE until we run out and let the rest fail. If not,
+ * then we assign at least one segment per PE, plus more based
+ * on the amount of devices under that PE
+ */
+ if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
+ residual = 0;
+ else
+ residual = phb->ioda.tce32_count -
+ phb->ioda.dma_pe_count;
+
+ pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
+ hose->global_number, phb->ioda.tce32_count);
+ pr_info("PCI: %d PE# for a total weight of %d\n",
+ phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+
+ /* Walk our PE list and configure their DMA segments, hand them
+ * out one base segment plus any residual segments based on
+ * weight
+ */
+ remaining = phb->ioda.tce32_count;
+ tw = phb->ioda.dma_weight;
+ base = 0;
+ list_for_each_entry(pe, &phb->ioda.pe_list, link) {
+ if (!pe->dma_weight)
+ continue;
+ if (!remaining) {
+ pe_warn(pe, "No DMA32 resources available\n");
+ continue;
+ }
+ segs = 1;
+ if (residual) {
+ segs += ((pe->dma_weight * residual) + (tw / 2)) / tw;
+ if (segs > remaining)
+ segs = remaining;
+ }
+ pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
+ pe->dma_weight, segs);
+ pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+ remaining -= segs;
+ base += segs;
+ }
+}
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int is_64,
+ struct msi_msg *msg)
+{
+ struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+ unsigned int xive_num = hwirq - phb->msi_base;
+ uint64_t addr64;
+ uint32_t addr32, data;
+ int rc;
+
+ /* No PE assigned ? bail out ... no MSI for you ! */
+ if (pe == NULL)
+ return -ENXIO;
+
+ /* Check if we have an MVE */
+ if (pe->mve_number < 0)
+ return -ENXIO;
+
+ /* Assign XIVE to PE */
+ rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+ if (rc) {
+ pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
+ pci_name(dev), rc, xive_num);
+ return -EIO;
+ }
+
+ if (is_64) {
+ rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
+ &addr64, &data);
+ if (rc) {
+ pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
+ pci_name(dev), rc);
+ return -EIO;
+ }
+ msg->address_hi = addr64 >> 32;
+ msg->address_lo = addr64 & 0xfffffffful;
+ } else {
+ rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
+ &addr32, &data);
+ if (rc) {
+ pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
+ pci_name(dev), rc);
+ return -EIO;
+ }
+ msg->address_hi = 0;
+ msg->address_lo = addr32;
+ }
+ msg->data = data;
+
+ pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
+ " address=%x_%08x data=%x PE# %d\n",
+ pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
+ msg->address_hi, msg->address_lo, data, pe->pe_number);
+
+ return 0;
+}
+
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+{
+ unsigned int bmap_size;
+ const __be32 *prop = of_get_property(phb->hose->dn,
+ "ibm,opal-msi-ranges", NULL);
+ if (!prop) {
+ /* BML Fallback */
+ prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
+ }
+ if (!prop)
+ return;
+
+ phb->msi_base = be32_to_cpup(prop);
+ phb->msi_count = be32_to_cpup(prop + 1);
+ bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long);
+ phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL);
+ if (!phb->msi_map) {
+ pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+ phb->hose->global_number);
+ return;
+ }
+ phb->msi_setup = pnv_pci_ioda_msi_setup;
+ phb->msi32_support = 1;
+ pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+ phb->msi_count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+/* This is the starting point of our IODA specific resource
+ * allocation process
+ */
+static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose)
+{
+ resource_size_t size, align;
+ struct pci_bus *child;
+
+ /* Associate PEs per functions */
+ pnv_ioda_setup_PEs(hose->bus);
+
+ /* Calculate all resources */
+ pnv_ioda_calc_bus(hose->bus, IORESOURCE_IO, &size, &align);
+ pnv_ioda_calc_bus(hose->bus, IORESOURCE_MEM, &size, &align);
+
+ /* Apply then to HW */
+ pnv_ioda_update_resources(hose->bus);
+
+ /* Setup DMA */
+ pnv_ioda_setup_dma(hose->private_data);
+
+ /* Configure PCI Express settings */
+ list_for_each_entry(child, &hose->bus->children, node) {
+ struct pci_dev *self = child->self;
+ if (!self)
+ continue;
+ pcie_bus_configure_settings(child, self->pcie_mpss);
+ }
+}
+
+/* Prevent enabling devices for which we couldn't properly
+ * assign a PE
+ */
+static int __devinit pnv_pci_enable_device_hook(struct pci_dev *dev)
+{
+ struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return -EINVAL;
+ return 0;
+}
+
+static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
+ u32 devfn)
+{
+ return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
+}
+
+void __init pnv_pci_init_ioda1_phb(struct device_node *np)
+{
+ struct pci_controller *hose;
+ static int primary = 1;
+ struct pnv_phb *phb;
+ unsigned long size, m32map_off, iomap_off, pemap_off;
+ const u64 *prop64;
+ u64 phb_id;
+ void *aux;
+ long rc;
+
+ pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-phbid\" property !\n");
+ return;
+ }
+ phb_id = be64_to_cpup(prop64);
+ pr_debug(" PHB-ID : 0x%016llx\n", phb_id);
+
+ phb = alloc_bootmem(sizeof(struct pnv_phb));
+ if (phb) {
+ memset(phb, 0, sizeof(struct pnv_phb));
+ phb->hose = hose = pcibios_alloc_controller(np);
+ }
+ if (!phb || !phb->hose) {
+ pr_err("PCI: Failed to allocate PCI controller for %s\n",
+ np->full_name);
+ return;
+ }
+
+ spin_lock_init(&phb->lock);
+ /* XXX Use device-tree */
+ hose->first_busno = 0;
+ hose->last_busno = 0xff;
+ hose->private_data = phb;
+ phb->opal_id = phb_id;
+ phb->type = PNV_PHB_IODA1;
+
+ /* Detect specific models for error handling */
+ if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
+ phb->model = PNV_PHB_MODEL_P7IOC;
+ else
+ phb->model = PNV_PHB_MODEL_UNKNOWN;
+
+ /* We parse "ranges" now since we need to deduce the register base
+ * from the IO base
+ */
+ pci_process_bridge_OF_ranges(phb->hose, np, primary);
+ primary = 0;
+
+ /* Magic formula from Milton */
+ phb->regs = of_iomap(np, 0);
+ if (phb->regs == NULL)
+ pr_err(" Failed to map registers !\n");
+
+
+ /* XXX This is hack-a-thon. This needs to be changed so that:
+ * - we obtain stuff like PE# etc... from device-tree
+ * - we properly re-allocate M32 ourselves
+ * (the OFW one isn't very good)
+ */
+
+ /* Initialize more IODA stuff */
+ phb->ioda.total_pe = 128;
+
+ phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
+ /* OFW Has already off top 64k of M32 space (MSI space) */
+ phb->ioda.m32_size += 0x10000;
+
+ phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+ phb->ioda.m32_pci_base = hose->mem_resources[0].start -
+ hose->pci_mem_offset;
+ phb->ioda.io_size = hose->pci_io_size;
+ phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+ phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+
+ /* Allocate aux data & arrays */
+ size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+ m32map_off = size;
+ size += phb->ioda.total_pe;
+ iomap_off = size;
+ size += phb->ioda.total_pe;
+ pemap_off = size;
+ size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+ aux = alloc_bootmem(size);
+ memset(aux, 0, size);
+ phb->ioda.pe_alloc = aux;
+ phb->ioda.m32_segmap = aux + m32map_off;
+ phb->ioda.io_segmap = aux + iomap_off;
+ phb->ioda.pe_array = aux + pemap_off;
+ set_bit(0, phb->ioda.pe_alloc);
+
+ INIT_LIST_HEAD(&phb->ioda.pe_list);
+
+ /* Calculate how many 32-bit TCE segments we have */
+ phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+
+ /* Clear unusable m64 */
+ hose->mem_resources[1].flags = 0;
+ hose->mem_resources[1].start = 0;
+ hose->mem_resources[1].end = 0;
+ hose->mem_resources[2].flags = 0;
+ hose->mem_resources[2].start = 0;
+ hose->mem_resources[2].end = 0;
+
+#if 0
+ rc = opal_pci_set_phb_mem_window(opal->phb_id,
+ window_type,
+ window_num,
+ starting_real_address,
+ starting_pci_address,
+ segment_size);
+#endif
+
+ pr_info(" %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n",
+ phb->ioda.total_pe,
+ phb->ioda.m32_size, phb->ioda.m32_segsize,
+ phb->ioda.io_size, phb->ioda.io_segsize);
+
+ if (phb->regs) {
+ pr_devel(" BUID = 0x%016llx\n", in_be64(phb->regs + 0x100));
+ pr_devel(" PHB2_CR = 0x%016llx\n", in_be64(phb->regs + 0x160));
+ pr_devel(" IO_BAR = 0x%016llx\n", in_be64(phb->regs + 0x170));
+ pr_devel(" IO_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x178));
+ pr_devel(" IO_SAR = 0x%016llx\n", in_be64(phb->regs + 0x180));
+ pr_devel(" M32_BAR = 0x%016llx\n", in_be64(phb->regs + 0x190));
+ pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198));
+ pr_devel(" M32_SAR = 0x%016llx\n", in_be64(phb->regs + 0x1a0));
+ }
+ phb->hose->ops = &pnv_pci_ops;
+
+ /* Setup RID -> PE mapping function */
+ phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
+
+ /* Setup TCEs */
+ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+
+ /* Setup MSI support */
+ pnv_pci_init_ioda_msis(phb);
+
+ /* We set both PCI_PROBE_ONLY and PCI_REASSIGN_ALL_RSRC. This is an
+ * odd combination which essentially means that we skip all resource
+ * fixups and assignments in the generic code, and do it all
+ * ourselves here
+ */
+ ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb;
+ ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
+ pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC);
+
+ /* Reset IODA tables to a clean state */
+ rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET);
+ if (rc)
+ pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc);
+ opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE);
+}
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+ struct device_node *phbn;
+ const u64 *prop64;
+ u64 hub_id;
+
+ pr_info("Probing IODA IO-Hub %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+ return;
+ }
+ hub_id = be64_to_cpup(prop64);
+ pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+ /* Count child PHBs */
+ for_each_child_of_node(np, phbn) {
+ /* Look for IODA1 PHBs */
+ if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+ pnv_pci_init_ioda1_phb(phbn);
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
new file mode 100644
index 00000000..26496777
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -0,0 +1,235 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Currently supports only P5IOC2
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/abs_addr.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* For now, use a fixed amount of TCE memory for each p5ioc2
+ * hub, 16M will do
+ */
+#define P5IOC2_TCE_MEMORY 0x01000000
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int is_64,
+ struct msi_msg *msg)
+{
+ if (WARN_ON(!is_64))
+ return -ENXIO;
+ msg->data = hwirq - phb->msi_base;
+ msg->address_hi = 0x10000000;
+ msg->address_lo = 0;
+
+ return 0;
+}
+
+static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
+{
+ unsigned int bmap_size;
+ const __be32 *prop = of_get_property(phb->hose->dn,
+ "ibm,opal-msi-ranges", NULL);
+ if (!prop)
+ return;
+
+ /* Don't do MSI's on p5ioc2 PCI-X are they are not properly
+ * verified in HW
+ */
+ if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix"))
+ return;
+ phb->msi_base = be32_to_cpup(prop);
+ phb->msi_count = be32_to_cpup(prop + 1);
+ bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long);
+ phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL);
+ if (!phb->msi_map) {
+ pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+ phb->hose->global_number);
+ return;
+ }
+ phb->msi_setup = pnv_pci_p5ioc2_msi_setup;
+ phb->msi32_support = 0;
+ pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+ phb->msi_count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+static void __devinit pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
+ struct pci_dev *pdev)
+{
+ if (phb->p5ioc2.iommu_table.it_map == NULL)
+ iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
+
+ set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
+}
+
+static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
+ void *tce_mem, u64 tce_size)
+{
+ struct pnv_phb *phb;
+ const u64 *prop64;
+ u64 phb_id;
+ int64_t rc;
+ static int primary = 1;
+
+ pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-phbid\" property !\n");
+ return;
+ }
+ phb_id = be64_to_cpup(prop64);
+ pr_devel(" PHB-ID : 0x%016llx\n", phb_id);
+ pr_devel(" TCE AT : 0x%016lx\n", __pa(tce_mem));
+ pr_devel(" TCE SZ : 0x%016llx\n", tce_size);
+
+ rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size);
+ if (rc != OPAL_SUCCESS) {
+ pr_err(" Failed to set TCE memory, OPAL error %lld\n", rc);
+ return;
+ }
+
+ phb = alloc_bootmem(sizeof(struct pnv_phb));
+ if (phb) {
+ memset(phb, 0, sizeof(struct pnv_phb));
+ phb->hose = pcibios_alloc_controller(np);
+ }
+ if (!phb || !phb->hose) {
+ pr_err(" Failed to allocate PCI controller\n");
+ return;
+ }
+
+ spin_lock_init(&phb->lock);
+ phb->hose->first_busno = 0;
+ phb->hose->last_busno = 0xff;
+ phb->hose->private_data = phb;
+ phb->opal_id = phb_id;
+ phb->type = PNV_PHB_P5IOC2;
+ phb->model = PNV_PHB_MODEL_P5IOC2;
+
+ phb->regs = of_iomap(np, 0);
+
+ if (phb->regs == NULL)
+ pr_err(" Failed to map registers !\n");
+ else {
+ pr_devel(" P_BUID = 0x%08x\n", in_be32(phb->regs + 0x100));
+ pr_devel(" P_IOSZ = 0x%08x\n", in_be32(phb->regs + 0x1b0));
+ pr_devel(" P_IO_ST = 0x%08x\n", in_be32(phb->regs + 0x1e0));
+ pr_devel(" P_MEM1_H = 0x%08x\n", in_be32(phb->regs + 0x1a0));
+ pr_devel(" P_MEM1_L = 0x%08x\n", in_be32(phb->regs + 0x190));
+ pr_devel(" P_MSZ1_L = 0x%08x\n", in_be32(phb->regs + 0x1c0));
+ pr_devel(" P_MEM_ST = 0x%08x\n", in_be32(phb->regs + 0x1d0));
+ pr_devel(" P_MEM2_H = 0x%08x\n", in_be32(phb->regs + 0x2c0));
+ pr_devel(" P_MEM2_L = 0x%08x\n", in_be32(phb->regs + 0x2b0));
+ pr_devel(" P_MSZ2_H = 0x%08x\n", in_be32(phb->regs + 0x2d0));
+ pr_devel(" P_MSZ2_L = 0x%08x\n", in_be32(phb->regs + 0x2e0));
+ }
+
+ /* Interpret the "ranges" property */
+ /* This also maps the I/O region and sets isa_io/mem_base */
+ pci_process_bridge_OF_ranges(phb->hose, np, primary);
+ primary = 0;
+
+ phb->hose->ops = &pnv_pci_ops;
+
+ /* Setup MSI support */
+ pnv_pci_init_p5ioc2_msis(phb);
+
+ /* Setup TCEs */
+ phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup;
+ pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
+ tce_mem, tce_size, 0);
+}
+
+void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
+{
+ struct device_node *phbn;
+ const u64 *prop64;
+ u64 hub_id;
+ void *tce_mem;
+ uint64_t tce_per_phb;
+ int64_t rc;
+ int phb_count = 0;
+
+ pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+ return;
+ }
+ hub_id = be64_to_cpup(prop64);
+ pr_info(" HUB-ID : 0x%016llx\n", hub_id);
+
+ /* Currently allocate 16M of TCE memory for every Hub
+ *
+ * XXX TODO: Make it chip local if possible
+ */
+ tce_mem = __alloc_bootmem(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY,
+ __pa(MAX_DMA_ADDRESS));
+ if (!tce_mem) {
+ pr_err(" Failed to allocate TCE Memory !\n");
+ return;
+ }
+ pr_debug(" TCE : 0x%016lx..0x%016lx\n",
+ __pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1);
+ rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem),
+ P5IOC2_TCE_MEMORY);
+ if (rc != OPAL_SUCCESS) {
+ pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc);
+ return;
+ }
+
+ /* Count child PHBs */
+ for_each_child_of_node(np, phbn) {
+ if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
+ of_device_is_compatible(phbn, "ibm,p5ioc2-pciex"))
+ phb_count++;
+ }
+
+ /* Calculate how much TCE space we can give per PHB */
+ tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count);
+ pr_info(" Allocating %lld MB of TCE memory per PHB\n",
+ tce_per_phb >> 20);
+
+ /* Initialize PHBs */
+ for_each_child_of_node(np, phbn) {
+ if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
+ of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
+ pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb);
+ tce_mem += tce_per_phb;
+ }
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
new file mode 100644
index 00000000..be3cfc5c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -0,0 +1,609 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Currently supports only P5IOC2
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/abs_addr.h>
+#include <asm/firmware.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* Delay in usec */
+#define PCI_RESET_DELAY_US 3000000
+
+#define cfg_dbg(fmt...) do { } while(0)
+//#define cfg_dbg(fmt...) printk(fmt)
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_msi_check_device(struct pci_dev* pdev, int nvec, int type)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ return (phb && phb->msi_map) ? 0 : -ENODEV;
+}
+
+static unsigned int pnv_get_one_msi(struct pnv_phb *phb)
+{
+ unsigned long flags;
+ unsigned int id, rc;
+
+ spin_lock_irqsave(&phb->lock, flags);
+
+ id = find_next_zero_bit(phb->msi_map, phb->msi_count, phb->msi_next);
+ if (id >= phb->msi_count && phb->msi_next)
+ id = find_next_zero_bit(phb->msi_map, phb->msi_count, 0);
+ if (id >= phb->msi_count) {
+ rc = 0;
+ goto out;
+ }
+ __set_bit(id, phb->msi_map);
+ rc = id + phb->msi_base;
+out:
+ spin_unlock_irqrestore(&phb->lock, flags);
+ return rc;
+}
+
+static void pnv_put_msi(struct pnv_phb *phb, unsigned int hwirq)
+{
+ unsigned long flags;
+ unsigned int id;
+
+ if (WARN_ON(hwirq < phb->msi_base ||
+ hwirq >= (phb->msi_base + phb->msi_count)))
+ return;
+ id = hwirq - phb->msi_base;
+
+ spin_lock_irqsave(&phb->lock, flags);
+ __clear_bit(id, phb->msi_map);
+ spin_unlock_irqrestore(&phb->lock, flags);
+}
+
+static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct msi_desc *entry;
+ struct msi_msg msg;
+ unsigned int hwirq, virq;
+ int rc;
+
+ if (WARN_ON(!phb))
+ return -ENODEV;
+
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
+ pr_warn("%s: Supports only 64-bit MSIs\n",
+ pci_name(pdev));
+ return -ENXIO;
+ }
+ hwirq = pnv_get_one_msi(phb);
+ if (!hwirq) {
+ pr_warn("%s: Failed to find a free MSI\n",
+ pci_name(pdev));
+ return -ENOSPC;
+ }
+ virq = irq_create_mapping(NULL, hwirq);
+ if (virq == NO_IRQ) {
+ pr_warn("%s: Failed to map MSI to linux irq\n",
+ pci_name(pdev));
+ pnv_put_msi(phb, hwirq);
+ return -ENOMEM;
+ }
+ rc = phb->msi_setup(phb, pdev, hwirq, entry->msi_attrib.is_64,
+ &msg);
+ if (rc) {
+ pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
+ irq_dispose_mapping(virq);
+ pnv_put_msi(phb, hwirq);
+ return rc;
+ }
+ irq_set_msi_desc(virq, entry);
+ write_msi_msg(virq, &msg);
+ }
+ return 0;
+}
+
+static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct msi_desc *entry;
+
+ if (WARN_ON(!phb))
+ return;
+
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ if (entry->irq == NO_IRQ)
+ continue;
+ irq_set_msi_desc(entry->irq, NULL);
+ pnv_put_msi(phb, virq_to_hw(entry->irq));
+ irq_dispose_mapping(entry->irq);
+ }
+}
+#endif /* CONFIG_PCI_MSI */
+
+static void pnv_pci_dump_p7ioc_diag_data(struct pnv_phb *phb)
+{
+ struct OpalIoP7IOCPhbErrorData *data = &phb->diag.p7ioc;
+ int i;
+
+ pr_info("PHB %d diagnostic data:\n", phb->hose->global_number);
+
+ pr_info(" brdgCtl = 0x%08x\n", data->brdgCtl);
+
+ pr_info(" portStatusReg = 0x%08x\n", data->portStatusReg);
+ pr_info(" rootCmplxStatus = 0x%08x\n", data->rootCmplxStatus);
+ pr_info(" busAgentStatus = 0x%08x\n", data->busAgentStatus);
+
+ pr_info(" deviceStatus = 0x%08x\n", data->deviceStatus);
+ pr_info(" slotStatus = 0x%08x\n", data->slotStatus);
+ pr_info(" linkStatus = 0x%08x\n", data->linkStatus);
+ pr_info(" devCmdStatus = 0x%08x\n", data->devCmdStatus);
+ pr_info(" devSecStatus = 0x%08x\n", data->devSecStatus);
+
+ pr_info(" rootErrorStatus = 0x%08x\n", data->rootErrorStatus);
+ pr_info(" uncorrErrorStatus = 0x%08x\n", data->uncorrErrorStatus);
+ pr_info(" corrErrorStatus = 0x%08x\n", data->corrErrorStatus);
+ pr_info(" tlpHdr1 = 0x%08x\n", data->tlpHdr1);
+ pr_info(" tlpHdr2 = 0x%08x\n", data->tlpHdr2);
+ pr_info(" tlpHdr3 = 0x%08x\n", data->tlpHdr3);
+ pr_info(" tlpHdr4 = 0x%08x\n", data->tlpHdr4);
+ pr_info(" sourceId = 0x%08x\n", data->sourceId);
+
+ pr_info(" errorClass = 0x%016llx\n", data->errorClass);
+ pr_info(" correlator = 0x%016llx\n", data->correlator);
+
+ pr_info(" p7iocPlssr = 0x%016llx\n", data->p7iocPlssr);
+ pr_info(" p7iocCsr = 0x%016llx\n", data->p7iocCsr);
+ pr_info(" lemFir = 0x%016llx\n", data->lemFir);
+ pr_info(" lemErrorMask = 0x%016llx\n", data->lemErrorMask);
+ pr_info(" lemWOF = 0x%016llx\n", data->lemWOF);
+ pr_info(" phbErrorStatus = 0x%016llx\n", data->phbErrorStatus);
+ pr_info(" phbFirstErrorStatus = 0x%016llx\n", data->phbFirstErrorStatus);
+ pr_info(" phbErrorLog0 = 0x%016llx\n", data->phbErrorLog0);
+ pr_info(" phbErrorLog1 = 0x%016llx\n", data->phbErrorLog1);
+ pr_info(" mmioErrorStatus = 0x%016llx\n", data->mmioErrorStatus);
+ pr_info(" mmioFirstErrorStatus = 0x%016llx\n", data->mmioFirstErrorStatus);
+ pr_info(" mmioErrorLog0 = 0x%016llx\n", data->mmioErrorLog0);
+ pr_info(" mmioErrorLog1 = 0x%016llx\n", data->mmioErrorLog1);
+ pr_info(" dma0ErrorStatus = 0x%016llx\n", data->dma0ErrorStatus);
+ pr_info(" dma0FirstErrorStatus = 0x%016llx\n", data->dma0FirstErrorStatus);
+ pr_info(" dma0ErrorLog0 = 0x%016llx\n", data->dma0ErrorLog0);
+ pr_info(" dma0ErrorLog1 = 0x%016llx\n", data->dma0ErrorLog1);
+ pr_info(" dma1ErrorStatus = 0x%016llx\n", data->dma1ErrorStatus);
+ pr_info(" dma1FirstErrorStatus = 0x%016llx\n", data->dma1FirstErrorStatus);
+ pr_info(" dma1ErrorLog0 = 0x%016llx\n", data->dma1ErrorLog0);
+ pr_info(" dma1ErrorLog1 = 0x%016llx\n", data->dma1ErrorLog1);
+
+ for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
+ if ((data->pestA[i] >> 63) == 0 &&
+ (data->pestB[i] >> 63) == 0)
+ continue;
+ pr_info(" PE[%3d] PESTA = 0x%016llx\n", i, data->pestA[i]);
+ pr_info(" PESTB = 0x%016llx\n", data->pestB[i]);
+ }
+}
+
+static void pnv_pci_dump_phb_diag_data(struct pnv_phb *phb)
+{
+ switch(phb->model) {
+ case PNV_PHB_MODEL_P7IOC:
+ pnv_pci_dump_p7ioc_diag_data(phb);
+ break;
+ default:
+ pr_warning("PCI %d: Can't decode this PHB diag data\n",
+ phb->hose->global_number);
+ }
+}
+
+static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
+{
+ unsigned long flags, rc;
+ int has_diag;
+
+ spin_lock_irqsave(&phb->lock, flags);
+
+ rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+ has_diag = (rc == OPAL_SUCCESS);
+
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ if (rc) {
+ pr_warning("PCI %d: Failed to clear EEH freeze state"
+ " for PE#%d, err %ld\n",
+ phb->hose->global_number, pe_no, rc);
+
+ /* For now, let's only display the diag buffer when we fail to clear
+ * the EEH status. We'll do more sensible things later when we have
+ * proper EEH support. We need to make sure we don't pollute ourselves
+ * with the normal errors generated when probing empty slots
+ */
+ if (has_diag)
+ pnv_pci_dump_phb_diag_data(phb);
+ else
+ pr_warning("PCI %d: No diag data available\n",
+ phb->hose->global_number);
+ }
+
+ spin_unlock_irqrestore(&phb->lock, flags);
+}
+
+static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus,
+ u32 bdfn)
+{
+ s64 rc;
+ u8 fstate;
+ u16 pcierr;
+ u32 pe_no;
+
+ /* Get PE# if we support IODA */
+ pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0;
+
+ /* Read freeze status */
+ rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr,
+ NULL);
+ if (rc) {
+ pr_warning("PCI %d: Failed to read EEH status for PE#%d,"
+ " err %lld\n", phb->hose->global_number, pe_no, rc);
+ return;
+ }
+ cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n",
+ bdfn, pe_no, fstate);
+ if (fstate != 0)
+ pnv_pci_handle_eeh_config(phb, pe_no);
+}
+
+static int pnv_pci_read_config(struct pci_bus *bus,
+ unsigned int devfn,
+ int where, int size, u32 *val)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
+ s64 rc;
+
+ if (hose == NULL)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ switch (size) {
+ case 1: {
+ u8 v8;
+ rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8);
+ *val = (rc == OPAL_SUCCESS) ? v8 : 0xff;
+ break;
+ }
+ case 2: {
+ u16 v16;
+ rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where,
+ &v16);
+ *val = (rc == OPAL_SUCCESS) ? v16 : 0xffff;
+ break;
+ }
+ case 4: {
+ u32 v32;
+ rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32);
+ *val = (rc == OPAL_SUCCESS) ? v32 : 0xffffffff;
+ break;
+ }
+ default:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+ }
+ cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n",
+ bus->number, devfn, where, size, *val);
+
+ /* Check if the PHB got frozen due to an error (no response) */
+ pnv_pci_config_check_eeh(phb, bus, bdfn);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+ unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
+
+ if (hose == NULL)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n",
+ bus->number, devfn, where, size, val);
+ switch (size) {
+ case 1:
+ opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
+ break;
+ case 2:
+ opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val);
+ break;
+ case 4:
+ opal_pci_config_write_word(phb->opal_id, bdfn, where, val);
+ break;
+ default:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+ }
+ /* Check if the PHB got frozen due to an error (no response) */
+ pnv_pci_config_check_eeh(phb, bus, bdfn);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+struct pci_ops pnv_pci_ops = {
+ .read = pnv_pci_read_config,
+ .write = pnv_pci_write_config,
+};
+
+
+static void pnv_tce_invalidate(struct iommu_table *tbl,
+ u64 *startp, u64 *endp)
+{
+ u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
+ unsigned long start, end, inc;
+
+ start = __pa(startp);
+ end = __pa(endp);
+
+
+ /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
+ if (tbl->it_busno) {
+ start <<= 12;
+ end <<= 12;
+ inc = 128 << 12;
+ start |= tbl->it_busno;
+ end |= tbl->it_busno;
+ }
+ /* p7ioc-style invalidation, 2 TCEs per write */
+ else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
+ start |= (1ull << 63);
+ end |= (1ull << 63);
+ inc = 16;
+ }
+ /* Default (older HW) */
+ else
+ inc = 128;
+
+ end |= inc - 1; /* round up end to be different than start */
+
+ mb(); /* Ensure above stores are visible */
+ while (start <= end) {
+ __raw_writeq(start, invalidate);
+ start += inc;
+ }
+ /* The iommu layer will do another mb() for us on build() and
+ * we don't care on free()
+ */
+}
+
+
+static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+ unsigned long uaddr, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ u64 proto_tce;
+ u64 *tcep, *tces;
+ u64 rpn;
+
+ proto_tce = TCE_PCI_READ; // Read allowed
+
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset;
+ rpn = __pa(uaddr) >> TCE_SHIFT;
+
+ while (npages--)
+ *(tcep++) = proto_tce | (rpn++ << TCE_RPN_SHIFT);
+
+ /* Some implementations won't cache invalid TCEs and thus may not
+ * need that flush. We'll probably turn it_type into a bit mask
+ * of flags if that becomes the case
+ */
+ if (tbl->it_type & TCE_PCI_SWINV_CREATE)
+ pnv_tce_invalidate(tbl, tces, tcep - 1);
+
+ return 0;
+}
+
+static void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
+{
+ u64 *tcep, *tces;
+
+ tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset;
+
+ while (npages--)
+ *(tcep++) = 0;
+
+ if (tbl->it_type & TCE_PCI_SWINV_FREE)
+ pnv_tce_invalidate(tbl, tces, tcep - 1);
+}
+
+void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+ void *tce_mem, u64 tce_size,
+ u64 dma_offset)
+{
+ tbl->it_blocksize = 16;
+ tbl->it_base = (unsigned long)tce_mem;
+ tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT;
+ tbl->it_index = 0;
+ tbl->it_size = tce_size >> 3;
+ tbl->it_busno = 0;
+ tbl->it_type = TCE_PCI;
+}
+
+static struct iommu_table * __devinit
+pnv_pci_setup_bml_iommu(struct pci_controller *hose)
+{
+ struct iommu_table *tbl;
+ const __be64 *basep, *swinvp;
+ const __be32 *sizep;
+
+ basep = of_get_property(hose->dn, "linux,tce-base", NULL);
+ sizep = of_get_property(hose->dn, "linux,tce-size", NULL);
+ if (basep == NULL || sizep == NULL) {
+ pr_err("PCI: %s has missing tce entries !\n",
+ hose->dn->full_name);
+ return NULL;
+ }
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node);
+ if (WARN_ON(!tbl))
+ return NULL;
+ pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
+ be32_to_cpup(sizep), 0);
+ iommu_init_table(tbl, hose->node);
+
+ /* Deal with SW invalidated TCEs when needed (BML way) */
+ swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info",
+ NULL);
+ if (swinvp) {
+ tbl->it_busno = swinvp[1];
+ tbl->it_index = (unsigned long)ioremap(swinvp[0], 8);
+ tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
+ }
+ return tbl;
+}
+
+static void __devinit pnv_pci_dma_fallback_setup(struct pci_controller *hose,
+ struct pci_dev *pdev)
+{
+ struct device_node *np = pci_bus_to_OF_node(hose->bus);
+ struct pci_dn *pdn;
+
+ if (np == NULL)
+ return;
+ pdn = PCI_DN(np);
+ if (!pdn->iommu_table)
+ pdn->iommu_table = pnv_pci_setup_bml_iommu(hose);
+ if (!pdn->iommu_table)
+ return;
+ set_iommu_table_base(&pdev->dev, pdn->iommu_table);
+}
+
+static void __devinit pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ /* If we have no phb structure, try to setup a fallback based on
+ * the device-tree (RTAS PCI for example)
+ */
+ if (phb && phb->dma_dev_setup)
+ phb->dma_dev_setup(phb, pdev);
+ else
+ pnv_pci_dma_fallback_setup(hose, pdev);
+}
+
+/* Fixup wrong class code in p7ioc root complex */
+static void __devinit pnv_p7ioc_rc_quirk(struct pci_dev *dev)
+{
+ dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
+
+static int pnv_pci_probe_mode(struct pci_bus *bus)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ const __be64 *tstamp;
+ u64 now, target;
+
+
+ /* We hijack this as a way to ensure we have waited long
+ * enough since the reset was lifted on the PCI bus
+ */
+ if (bus != hose->bus)
+ return PCI_PROBE_NORMAL;
+ tstamp = of_get_property(hose->dn, "reset-clear-timestamp", NULL);
+ if (!tstamp || !*tstamp)
+ return PCI_PROBE_NORMAL;
+
+ now = mftb() / tb_ticks_per_usec;
+ target = (be64_to_cpup(tstamp) / tb_ticks_per_usec)
+ + PCI_RESET_DELAY_US;
+
+ pr_devel("pci %04d: Reset target: 0x%llx now: 0x%llx\n",
+ hose->global_number, target, now);
+
+ if (now < target)
+ msleep((target - now + 999) / 1000);
+
+ return PCI_PROBE_NORMAL;
+}
+
+void __init pnv_pci_init(void)
+{
+ struct device_node *np;
+
+ pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
+
+ /* OPAL absent, try POPAL first then RTAS detection of PHBs */
+ if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+#ifdef CONFIG_PPC_POWERNV_RTAS
+ init_pci_config_tokens();
+ find_and_init_phbs();
+#endif /* CONFIG_PPC_POWERNV_RTAS */
+ }
+ /* OPAL is here, do our normal stuff */
+ else {
+ int found_ioda = 0;
+
+ /* Look for IODA IO-Hubs. We don't support mixing IODA
+ * and p5ioc2 due to the need to change some global
+ * probing flags
+ */
+ for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
+ pnv_pci_init_ioda_hub(np);
+ found_ioda = 1;
+ }
+
+ /* Look for p5ioc2 IO-Hubs */
+ if (!found_ioda)
+ for_each_compatible_node(np, NULL, "ibm,p5ioc2")
+ pnv_pci_init_p5ioc2_hub(np);
+ }
+
+ /* Setup the linkage between OF nodes and PHBs */
+ pci_devs_phb_init();
+
+ /* Configure IOMMU DMA hooks */
+ ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
+ ppc_md.tce_build = pnv_tce_build;
+ ppc_md.tce_free = pnv_tce_free;
+ ppc_md.pci_probe_mode = pnv_pci_probe_mode;
+ set_pci_dma_ops(&dma_iommu_ops);
+
+ /* Configure MSIs */
+#ifdef CONFIG_PCI_MSI
+ ppc_md.msi_check_device = pnv_msi_check_device;
+ ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
+ ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
+#endif
+}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
new file mode 100644
index 00000000..8bc47963
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -0,0 +1,148 @@
+#ifndef __POWERNV_PCI_H
+#define __POWERNV_PCI_H
+
+struct pci_dn;
+
+enum pnv_phb_type {
+ PNV_PHB_P5IOC2,
+ PNV_PHB_IODA1,
+ PNV_PHB_IODA2,
+};
+
+/* Precise PHB model for error management */
+enum pnv_phb_model {
+ PNV_PHB_MODEL_UNKNOWN,
+ PNV_PHB_MODEL_P5IOC2,
+ PNV_PHB_MODEL_P7IOC,
+};
+
+#define PNV_PCI_DIAG_BUF_SIZE 4096
+
+/* Data associated with a PE, including IOMMU tracking etc.. */
+struct pnv_ioda_pe {
+ /* A PE can be associated with a single device or an
+ * entire bus (& children). In the former case, pdev
+ * is populated, in the later case, pbus is.
+ */
+ struct pci_dev *pdev;
+ struct pci_bus *pbus;
+
+ /* Effective RID (device RID for a device PE and base bus
+ * RID with devfn 0 for a bus PE)
+ */
+ unsigned int rid;
+
+ /* PE number */
+ unsigned int pe_number;
+
+ /* "Weight" assigned to the PE for the sake of DMA resource
+ * allocations
+ */
+ unsigned int dma_weight;
+
+ /* This is a PCI-E -> PCI-X bridge, this points to the
+ * corresponding bus PE
+ */
+ struct pnv_ioda_pe *bus_pe;
+
+ /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
+ int tce32_seg;
+ int tce32_segcount;
+ struct iommu_table tce32_table;
+
+ /* XXX TODO: Add support for additional 64-bit iommus */
+
+ /* MSIs. MVE index is identical for for 32 and 64 bit MSI
+ * and -1 if not supported. (It's actually identical to the
+ * PE number)
+ */
+ int mve_number;
+
+ /* Link in list of PE#s */
+ struct list_head link;
+};
+
+struct pnv_phb {
+ struct pci_controller *hose;
+ enum pnv_phb_type type;
+ enum pnv_phb_model model;
+ u64 opal_id;
+ void __iomem *regs;
+ spinlock_t lock;
+
+#ifdef CONFIG_PCI_MSI
+ unsigned long *msi_map;
+ unsigned int msi_base;
+ unsigned int msi_count;
+ unsigned int msi_next;
+ unsigned int msi32_support;
+#endif
+ int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int is_64,
+ struct msi_msg *msg);
+ void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
+ void (*fixup_phb)(struct pci_controller *hose);
+ u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
+
+ union {
+ struct {
+ struct iommu_table iommu_table;
+ } p5ioc2;
+
+ struct {
+ /* Global bridge info */
+ unsigned int total_pe;
+ unsigned int m32_size;
+ unsigned int m32_segsize;
+ unsigned int m32_pci_base;
+ unsigned int io_size;
+ unsigned int io_segsize;
+ unsigned int io_pci_base;
+
+ /* PE allocation bitmap */
+ unsigned long *pe_alloc;
+
+ /* M32 & IO segment maps */
+ unsigned int *m32_segmap;
+ unsigned int *io_segmap;
+ struct pnv_ioda_pe *pe_array;
+
+ /* Reverse map of PEs, will have to extend if
+ * we are to support more than 256 PEs, indexed
+ * bus { bus, devfn }
+ */
+ unsigned char pe_rmap[0x10000];
+
+ /* 32-bit TCE tables allocation */
+ unsigned long tce32_count;
+
+ /* Total "weight" for the sake of DMA resources
+ * allocation
+ */
+ unsigned int dma_weight;
+ unsigned int dma_pe_count;
+
+ /* Sorted list of used PE's, sorted at
+ * boot for resource allocation purposes
+ */
+ struct list_head pe_list;
+ } ioda;
+ };
+
+ /* PHB status structure */
+ union {
+ unsigned char blob[PNV_PCI_DIAG_BUF_SIZE];
+ struct OpalIoP7IOCPhbErrorData p7ioc;
+ } diag;
+};
+
+extern struct pci_ops pnv_pci_ops;
+
+extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+ void *tce_mem, u64 tce_size,
+ u64 dma_offset);
+extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
+extern void pnv_pci_init_ioda_hub(struct device_node *np);
+
+
+#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
new file mode 100644
index 00000000..8a9df7f9
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -0,0 +1,16 @@
+#ifndef _POWERNV_H
+#define _POWERNV_H
+
+#ifdef CONFIG_SMP
+extern void pnv_smp_init(void);
+#else
+static inline void pnv_smp_init(void) { }
+#endif
+
+#ifdef CONFIG_PCI
+extern void pnv_pci_init(void);
+#else
+static inline void pnv_pci_init(void) { }
+#endif
+
+#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
new file mode 100644
index 00000000..db1ad1c8
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -0,0 +1,195 @@
+/*
+ * PowerNV setup code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/interrupt.h>
+#include <linux/bug.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/rtas.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+
+static void __init pnv_setup_arch(void)
+{
+ /* Initialize SMP */
+ pnv_smp_init();
+
+ /* Setup PCI */
+ pnv_pci_init();
+
+ /* Setup RTC and NVRAM callbacks */
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ opal_nvram_init();
+
+ /* Enable NAP mode */
+ powersave_nap = 1;
+
+ /* XXX PMCS */
+}
+
+static void __init pnv_init_early(void)
+{
+#ifdef CONFIG_HVC_OPAL
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ hvc_opal_init_early();
+ else
+#endif
+ add_preferred_console("hvc", 0, NULL);
+}
+
+static void __init pnv_init_IRQ(void)
+{
+ xics_init();
+
+ WARN_ON(!ppc_md.get_irq);
+}
+
+static void pnv_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = of_get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: PowerNV %s\n", model);
+ if (firmware_has_feature(FW_FEATURE_OPALv2))
+ seq_printf(m, "firmware\t: OPAL v2\n");
+ else if (firmware_has_feature(FW_FEATURE_OPAL))
+ seq_printf(m, "firmware\t: OPAL v1\n");
+ else
+ seq_printf(m, "firmware\t: BML\n");
+ of_node_put(root);
+}
+
+static void __noreturn pnv_restart(char *cmd)
+{
+ long rc = OPAL_BUSY;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_reboot();
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_power_off(void)
+{
+ long rc = OPAL_BUSY;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_power_down(0);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_halt(void)
+{
+ pnv_power_off();
+}
+
+static void pnv_progress(char *s, unsigned short hex)
+{
+}
+
+#ifdef CONFIG_KEXEC
+static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ xics_kexec_teardown_cpu(secondary);
+}
+#endif /* CONFIG_KEXEC */
+
+static void __init pnv_setup_machdep_opal(void)
+{
+ ppc_md.get_boot_time = opal_get_boot_time;
+ ppc_md.get_rtc_time = opal_get_rtc_time;
+ ppc_md.set_rtc_time = opal_set_rtc_time;
+ ppc_md.restart = pnv_restart;
+ ppc_md.power_off = pnv_power_off;
+ ppc_md.halt = pnv_halt;
+ ppc_md.machine_check_exception = opal_machine_check;
+}
+
+#ifdef CONFIG_PPC_POWERNV_RTAS
+static void __init pnv_setup_machdep_rtas(void)
+{
+ if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) {
+ ppc_md.get_boot_time = rtas_get_boot_time;
+ ppc_md.get_rtc_time = rtas_get_rtc_time;
+ ppc_md.set_rtc_time = rtas_set_rtc_time;
+ }
+ ppc_md.restart = rtas_restart;
+ ppc_md.power_off = rtas_power_off;
+ ppc_md.halt = rtas_halt;
+}
+#endif /* CONFIG_PPC_POWERNV_RTAS */
+
+static int __init pnv_probe(void)
+{
+ unsigned long root = of_get_flat_dt_root();
+
+ if (!of_flat_dt_is_compatible(root, "ibm,powernv"))
+ return 0;
+
+ hpte_init_native();
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ pnv_setup_machdep_opal();
+#ifdef CONFIG_PPC_POWERNV_RTAS
+ else if (rtas.base)
+ pnv_setup_machdep_rtas();
+#endif /* CONFIG_PPC_POWERNV_RTAS */
+
+ pr_debug("PowerNV detected !\n");
+
+ return 1;
+}
+
+define_machine(powernv) {
+ .name = "PowerNV",
+ .probe = pnv_probe,
+ .init_early = pnv_init_early,
+ .setup_arch = pnv_setup_arch,
+ .init_IRQ = pnv_init_IRQ,
+ .show_cpuinfo = pnv_show_cpuinfo,
+ .progress = pnv_progress,
+ .power_save = power7_idle,
+ .calibrate_decr = generic_calibrate_decr,
+#ifdef CONFIG_KEXEC
+ .kexec_cpu_down = pnv_kexec_cpu_down,
+#endif
+};
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
new file mode 100644
index 00000000..3ef46254
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -0,0 +1,181 @@
+/*
+ * SMP support for PowerNV machines.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/irq.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/rtas.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+static void __cpuinit pnv_smp_setup_cpu(int cpu)
+{
+ if (cpu != boot_cpuid)
+ xics_setup_cpu();
+}
+
+static int pnv_smp_cpu_bootable(unsigned int nr)
+{
+ /* Special case - we inhibit secondary thread startup
+ * during boot if the user requests it.
+ */
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
+ return 0;
+ if (smt_enabled_at_boot
+ && cpu_thread_in_core(nr) >= smt_enabled_at_boot)
+ return 0;
+ }
+
+ return 1;
+}
+
+int __devinit pnv_smp_kick_cpu(int nr)
+{
+ unsigned int pcpu = get_hard_smp_processor_id(nr);
+ unsigned long start_here = __pa(*((unsigned long *)
+ generic_secondary_smp_init));
+ long rc;
+
+ BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+ /* On OPAL v2 the CPU are still spinning inside OPAL itself,
+ * get them back now
+ */
+ if (!paca[nr].cpu_start && firmware_has_feature(FW_FEATURE_OPALv2)) {
+ pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu);
+ rc = opal_start_cpu(pcpu, start_here);
+ if (rc != OPAL_SUCCESS)
+ pr_warn("OPAL Error %ld starting CPU %d\n",
+ rc, nr);
+ }
+ return smp_generic_kick_cpu(nr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int pnv_smp_cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /* This is identical to pSeries... might consolidate by
+ * moving migrate_irqs_away to a ppc_md with default to
+ * the generic fixup_irqs. --BenH.
+ */
+ set_cpu_online(cpu, false);
+ vdso_data->processorCount--;
+ if (cpu == boot_cpuid)
+ boot_cpuid = cpumask_any(cpu_online_mask);
+ xics_migrate_irqs_away();
+ return 0;
+}
+
+static void pnv_smp_cpu_kill_self(void)
+{
+ unsigned int cpu;
+
+ /* If powersave_nap is enabled, use NAP mode, else just
+ * spin aimlessly
+ */
+ if (!powersave_nap) {
+ generic_mach_cpu_die();
+ return;
+ }
+
+ /* Standard hot unplug procedure */
+ local_irq_disable();
+ idle_task_exit();
+ current->active_mm = NULL; /* for sanity */
+ cpu = smp_processor_id();
+ DBG("CPU%d offline\n", cpu);
+ generic_set_cpu_dead(cpu);
+ smp_wmb();
+
+ /* We don't want to take decrementer interrupts while we are offline,
+ * so clear LPCR:PECE1. We keep PECE2 enabled.
+ */
+ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
+ while (!generic_check_cpu_restart(cpu)) {
+ power7_idle();
+ if (!generic_check_cpu_restart(cpu)) {
+ DBG("CPU%d Unexpected exit while offline !\n", cpu);
+ /* We may be getting an IPI, so we re-enable
+ * interrupts to process it, it will be ignored
+ * since we aren't online (hopefully)
+ */
+ local_irq_enable();
+ local_irq_disable();
+ }
+ }
+ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1);
+ DBG("CPU%d coming online...\n", cpu);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct smp_ops_t pnv_smp_ops = {
+ .message_pass = smp_muxed_ipi_message_pass,
+ .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
+ .probe = xics_smp_probe,
+ .kick_cpu = pnv_smp_kick_cpu,
+ .setup_cpu = pnv_smp_setup_cpu,
+ .cpu_bootable = pnv_smp_cpu_bootable,
+#ifdef CONFIG_HOTPLUG_CPU
+ .cpu_disable = pnv_smp_cpu_disable,
+ .cpu_die = generic_cpu_die,
+#endif /* CONFIG_HOTPLUG_CPU */
+};
+
+/* This is called very early during platform setup_arch */
+void __init pnv_smp_init(void)
+{
+ smp_ops = &pnv_smp_ops;
+
+ /* XXX We don't yet have a proper entry point from HAL, for
+ * now we rely on kexec-style entry from BML
+ */
+
+#ifdef CONFIG_PPC_RTAS
+ /* Non-lpar has additional take/give timebase */
+ if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+ smp_ops->give_timebase = rtas_give_timebase;
+ smp_ops->take_timebase = rtas_take_timebase;
+ }
+#endif /* CONFIG_PPC_RTAS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+ ppc_md.cpu_die = pnv_smp_cpu_kill_self;
+#endif
+}