Home
Reading
Searching
Subscribe
Sponsors
Statistics
Posting
Contact
Spam
Lists
Links
About
Hosting
Filtering
Features Download
Marketing
Archives
FAQ
Blog
 
Gmane
From: Mark Gross <mgross <at> linux.intel.com>
Subject: [RFC] [PATCH] Power Managed memory base enabling
Newsgroups: gmane.linux.power-management.general
Date: Monday 5th March 2007 18:18:26 UTC (over 10 years ago)
The following patch is to help enable both allocation and access based
memory power optimization policies for systems that have the capability
to put sticks of memory selectively into lower power states based on
workload requirement.

To be clear PM-memory will not be useful unless you have workloads that
can take advantage of it.  The identified workloads are not desktop
workloads.  However; there is a non-zero number of interested users with
applicable workloads that make pushing the enabling patches out to the
community worth while.  These workloads tend to be within network
elements and servers where memory utilization tracks traffic load.  

This patch is very simple.  It is independent of the ongoing
anti-fragmentation, page migration and hot remove activities.  It should
coexist and be complimentary to those.

The goals of this patch are:
* provide a method for identifying power managed memory that could change
state at runtime under policy manager and OS control.
* avoid start up allocations from putting kernel memory structures into
such memory at boot time.
* be minimal and transparent for platforms without such memory.

It makes use of the existing NUMA implementation.  

It implements a convention on the 4 bytes of "Proximity Domain ID"
within the SRAT memory affinity structure as defined in ACPI3.0a.  If
bit 31 is set, then the memory range represented by that PXM is assumed
to be power managed.  We are working on defining a "standard" for
identifying such memory areas as power manageable and progress committee
based.  

We are going with the above convention for the time being as it doesn't
violate ACPI specifications.  Some time in the future after the
committees are finished some of this code may need to be updated.

To exercise the capability on a platform with PM-memory, you will still
need to include a policy manager with some code to trigger the state
changes to enable transition into and out of a low power state. 

More will be done, but for now we would like to get this base enabling
into the upstream kernel as an initial step.

Thanks,

--mgross 

Signed-off-by: Mark Gross 



diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/arch/x86_64/mm/numa.c
linux-2.6.20-mm2-monroe/arch/x86_64/mm/numa.c
--- linux-2.6.20-mm2/arch/x86_64/mm/numa.c	2007-02-23 11:20:38.000000000
-0800
+++ linux-2.6.20-mm2-monroe/arch/x86_64/mm/numa.c	2007-03-02
15:15:53.000000000 -0800
@@ -156,12 +156,55 @@
 }
 #endif
 
+/* we need a place to save the next start address to use for each node
because
+ * we need to allocate the pgdata and bootmem for power managed memory in
+ * non-power managed nodes.  We do this by saving off where we can start
+ * allocating in the nodes and updating them as the boot up proceeds.
+ */
+static unsigned long bootmem_start[MAX_NUMNODES];
+
+
 static void * __init
 early_node_mem(int nodeid, unsigned long start, unsigned long end,
 	      unsigned long size)
 {
-	unsigned long mem = find_e820_area(start, end, size);
+	unsigned long mem;
 	void *ptr;
+	if (bootmem_start[nodeid] <= start) {
+		bootmem_start[nodeid] = start;
+	}
+
+	mem = -1L;
+	if (power_managed_node(nodeid)) {
+		int non_pm_node = find_closest_non_pm_node(nodeid);
+
+		if (!node_online(non_pm_node)) {
+			return NULL; /* expect nodeid to get setup on the next
+					pass of setup_node_boot_mem after
+					non_pm_node is online*/
+		} else {
+			/* We set up the allocation in the non_pm_node
+			 * get the end of non_pm_node boot allocations
+			 * allocate from there.
+			 */
+			unsigned int non_pm_end;
+
+			non_pm_end = (NODE_DATA(non_pm_node)->node_start_pfn +
+				NODE_DATA(non_pm_node)->node_spanned_pages)
+					<< PAGE_SHIFT;
+
+			mem = find_e820_area(bootmem_start[non_pm_node],
+					non_pm_end, size);
+			/* now increment bootmem_start for next call */
+			if (mem!= -1L)
+				bootmem_start[non_pm_node] =
+					round_up(mem + size, PAGE_SIZE);
+		}
+	} else {
+		mem = find_e820_area(bootmem_start[nodeid], end, size);
+		if (mem!= -1L)
+			bootmem_start[nodeid] = round_up(mem + size, PAGE_SIZE);
+	}	
 	if (mem != -1L)
 		return __va(mem);
 	ptr = __alloc_bootmem_nopanic(size,
@@ -180,6 +223,7 @@
 	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size,
bootmap_start; 
 	unsigned long nodedata_phys;
 	void *bootmap;
+	int temp_id;
 	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
 
 	start = round_up(start, ZONE_ALIGN); 
@@ -219,8 +263,13 @@
 
 	free_bootmem_with_active_regions(nodeid, end);
 
-	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
-	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
bootmap_pages<node_mem_map = 
-		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
-				memmapsize, SMP_CACHE_BYTES, 
-				round_down(limit - memmapsize, PAGE_SIZE), 
+	if (power_managed_node(nodeid)) {
+		int non_pm_node = find_closest_non_pm_node(nodeid);
+
+		NODE_DATA(nodeid)->node_mem_map =
+			__alloc_bootmem_core(NODE_DATA(non_pm_node)->bdata,
+				memmapsize, SMP_CACHE_BYTES,
+				round_down(limit - memmapsize, PAGE_SIZE),
 				limit);
+	} else {
+ 		NODE_DATA(nodeid)->node_mem_map =
+			__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
+				memmapsize, SMP_CACHE_BYTES,
+				round_down(limit - memmapsize, PAGE_SIZE),
+				limit);
+	}
 	printk(KERN_DEBUG "Node %d memmap at 0x%p size %lu first pfn 0x%p\n",
 			nodeid, NODE_DATA(nodeid)->node_mem_map,
 			memmapsize, NODE_DATA(nodeid)->node_mem_map);
@@ -266,7 +325,10 @@
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
- 		numa_set_node(i, rr);
+		if (power_managed_node(rr))
+			numa_set_node(i,find_closest_non_pm_node(rr));
+		else
+			numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/arch/x86_64/mm/srat.c
linux-2.6.20-mm2-monroe/arch/x86_64/mm/srat.c
--- linux-2.6.20-mm2/arch/x86_64/mm/srat.c	2007-02-23 11:20:38.000000000
-0800
+++ linux-2.6.20-mm2-monroe/arch/x86_64/mm/srat.c	2007-03-02
15:15:53.000000000 -0800
@@ -28,6 +28,7 @@
 static nodemask_t nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
+static int pm_node[MAX_NUMNODES];
 static int found_add_area __initdata;
 int hotadd_percent __initdata = 0;
 
@@ -299,7 +300,11 @@
 		return;
 	start = ma->base_address;
 	end = start + ma->length;
-	pxm = ma->proximity_domain;
+	pxm = ma->proximity_domain & 0x0000ffff;
+	if (ma->proximity_domain & (1<<31))
+		pm_node[pxm] = 1;
+	else
+		pm_node[pxm] = 0;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
@@ -467,8 +472,6 @@
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }
 
-EXPORT_SYMBOL(__node_distance);
-
 int memory_add_physaddr_to_nid(u64 start)
 {
 	int i, ret = 0;
@@ -479,5 +482,36 @@
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
+int __power_managed_node(int srat_node)
+{
+	return pm_node[node_to_pxm(srat_node)];
+}
+
+int __power_managed_memory_present(void)
+{
+	int j;
+
+	for (j=0; jbdata, size,
+				align, goal, 0)))
+			return(ptr);
 	}
 	return NULL;
 }
@@ -463,16 +464,15 @@
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 				  unsigned long goal)
 {
-	bootmem_data_t *bdata;
 	void *ptr;
+	int i;
 
-	list_for_each_entry(bdata, &bdata_list, list) {
-		ptr = __alloc_bootmem_core(bdata, size, align, goal,
-						ARCH_LOW_ADDRESS_LIMIT);
-		if (ptr)
-			return ptr;
+	for_each_online_node(i) {
+		if ((!power_managed_node(i)) && (ptr =
+			__alloc_bootmem_core(NODE_DATA(i)->bdata, size,
+				align, goal, ARCH_LOW_ADDRESS_LIMIT)))
+			return(ptr);
 	}
-
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
 	 */
diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/mm/memory.c linux-2.6.20-mm2-monroe/mm/memory.c
--- linux-2.6.20-mm2/mm/memory.c	2007-02-23 11:20:40.000000000 -0800
+++ linux-2.6.20-mm2-monroe/mm/memory.c	2007-03-02 15:15:53.000000000 -0800
@@ -2882,3 +2882,29 @@
 	return buf - old_buf;
 }
 EXPORT_SYMBOL_GPL(access_process_vm);
+
+#ifdef __x86_64__
+extern int __power_managed_memory_present(void);
+extern int __power_managed_node(int srat_node);
+extern int __find_closest_non_pm_node(int nodeid);
+#else
+inline int __power_managed_memory_present(void) { return 0};
+inline int __power_managed_node(int srat_node) { return 0};
+inline int __find_closest_non_pm_node(int nodeid) { return nodeid};
+#endif
+
+int power_managed_memory_present(void)
+{
+	return __power_managed_memory_present();
+}
+
+int power_managed_node(int srat_node)
+{
+	return __power_managed_node(srat_node);
+}
+
+int find_closest_non_pm_node(int nodeid)
+{
+	return __find_closest_non_pm_node(nodeid);
+}
+
diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/mm/mempolicy.c linux-2.6.20-mm2-monroe/mm/mempolicy.c
--- linux-2.6.20-mm2/mm/mempolicy.c	2007-02-23 11:20:40.000000000 -0800
+++ linux-2.6.20-mm2-monroe/mm/mempolicy.c	2007-03-02 15:15:53.000000000
-0800
@@ -1617,8 +1617,13 @@
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
-		printk("numa_policy_init: interleaving failed\n");
+	if (power_managed_memory_present()) {
+		if (do_set_mempolicy(MPOL_DEFAULT, &node_online_map))
+			printk("numa_policy_init: interleaving failed\n");
+	} else {
+		if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+			printk("numa_policy_init: interleaving failed\n");
+	}
 }
 
 /* Reset policy of current process to default */
diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/mm/page_alloc.c linux-2.6.20-mm2-monroe/mm/page_alloc.c
--- linux-2.6.20-mm2/mm/page_alloc.c	2007-02-23 11:20:40.000000000 -0800
+++ linux-2.6.20-mm2-monroe/mm/page_alloc.c	2007-03-02 15:15:53.000000000
-0800
@@ -2308,8 +2308,17 @@
 					* sizeof(wait_queue_head_t);
 
  	if (system_state == SYSTEM_BOOTING) {
-		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, alloc_size);
+		if(power_managed_node(pgdat->node_id)) {
+			int nid;
+
+			nid = find_closest_non_pm_node(pgdat->node_id);
+			zone->wait_table = (wait_queue_head_t *)
+				alloc_bootmem_node(NODE_DATA(nid), alloc_size);
+		} else
+		{
+			zone->wait_table = (wait_queue_head_t *)
+				alloc_bootmem_node(pgdat, alloc_size);
+		}
 	} else {
 		/*
 		 * This case means that a zone whose size was 0 gets new memory
@@ -2824,8 +2833,15 @@
 		end = ALIGN(end, MAX_ORDER_NR_PAGES);
 		size =  (end - start) * sizeof(struct page);
 		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
+		if (!map) {
+		if(power_managed_node(pgdat->node_id)) {
+			int nid;
+
+			nid = find_closest_non_pm_node(pgdat->node_id);
+			map = alloc_bootmem_node(NODE_DATA(nid), size);
+		} else	
 			map = alloc_bootmem_node(pgdat, size);
+		}
 		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
 		printk(KERN_DEBUG
 			"Node %d memmap at 0x%p size %lu first pfn 0x%p\n",
diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/mm/slab.c linux-2.6.20-mm2-monroe/mm/slab.c
--- linux-2.6.20-mm2/mm/slab.c	2007-02-23 11:20:40.000000000 -0800
+++ linux-2.6.20-mm2-monroe/mm/slab.c	2007-03-02 15:15:53.000000000 -0800
@@ -3378,6 +3378,7 @@
  *
  * Fallback to other node is possible if __GFP_THISNODE is not set.
  */
+
 static __always_inline void *
 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 		   void *caller)
@@ -3391,6 +3392,9 @@
 	if (unlikely(nodeid == -1))
 		nodeid = numa_node_id();
 
+	if (power_managed_node(nodeid) )
+		nodeid = find_closest_non_pm_node(nodeid);
+
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Node not bootstrapped yet */
 		ptr = fallback_alloc(cachep, flags);
@@ -3664,6 +3668,8 @@
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int
nodeid)
 {
+	if (power_managed_node(nodeid) )
+			nodeid = find_closest_non_pm_node(nodeid);
 	return __cache_alloc_node(cachep, flags, nodeid,
 			__builtin_return_address(0));
 }
diff -urN -X linux-2.6.20-mm2/Documentation/dontdiff
linux-2.6.20-mm2/mm/sparse.c linux-2.6.20-mm2-monroe/mm/sparse.c
--- linux-2.6.20-mm2/mm/sparse.c	2007-02-04 10:44:54.000000000 -0800
+++ linux-2.6.20-mm2-monroe/mm/sparse.c	2007-03-02 15:15:53.000000000 -0800
@@ -49,6 +49,8 @@
 	struct mem_section *section = NULL;
 	unsigned long array_size = SECTIONS_PER_ROOT *
 				   sizeof(struct mem_section);
+	if (power_managed_node(nid))
+		nid = find_closest_non_pm_node(nid);
 
 	if (slab_is_available())
 		section = kmalloc_node(array_size, GFP_KERNEL, nid);
@@ -215,6 +217,9 @@
 	struct mem_section *ms = __nr_to_section(pnum);
 	int nid = sparse_early_nid(ms);
 
+	if (power_managed_node(nid))
+		nid = find_closest_non_pm_node(nid);
+
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
 		return map;
 
CD: 3ms