linux | Understanding the Linux Kernel's Physical Memory Model in One Article

一、Architecture and Memory Models

1、Architecture

  • Today's multiprocessor systems use one of two architectures:
  1. Non-Uniform Memory Access (NUMA): memory is divided into multiple memory nodes, and the time it takes to access a node depends on the distance between the processor and that node. NUMA is the mainstream architecture for mid- and high-end servers (see the node-distance sketch after this list).
  2. Symmetric Multi-Processing (SMP), also called Uniform Memory Access (UMA): every processor takes the same time to access memory. All processors are peers; they are unequal only during kernel initialization, when "CPU 0 acts as the boot processor and initializes the kernel while the other processors wait for initialization to complete."
  • In practice the two are often combined: SMP is used inside each NUMA node.
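To make the notion of node distance concrete, here is a minimal kernel-module sketch (the module and function names are hypothetical; for_each_online_node() and node_distance() are standard kernel helpers) that prints the distance from every online node to every other node:

```c
#include <linux/module.h>
#include <linux/nodemask.h>	/* for_each_online_node() */
#include <linux/topology.h>	/* node_distance() */

static int __init numa_distance_demo_init(void)
{
	int from, to;

	/*
	 * SLIT-style distances: a node's distance to itself is
	 * LOCAL_DISTANCE (normally 10); remote nodes report larger
	 * values, reflecting slower access.
	 */
	for_each_online_node(from)
		for_each_online_node(to)
			pr_info("node %d -> node %d: distance %d\n",
				from, to, node_distance(from, to));
	return 0;
}

static void __exit numa_distance_demo_exit(void)
{
}

module_init(numa_distance_demo_init);
module_exit(numa_distance_demo_exit);
MODULE_LICENSE("GPL");
```

On a single-node (UMA) machine this prints only the local distance; on a NUMA machine the off-diagonal entries grow with the distance between nodes.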
2、Memory Models
  • A memory model is the layout of physical memory as seen from the processor's point of view, and the kernel manages different memory models in different ways. The memory management subsystem supports three memory models (the pfn_to_page sketch after this list shows how the model changes the page lookup):
  1. Flat Memory: the physical address space is contiguous, with no holes.
  2. Discontiguous Memory: the physical address space contains holes; this model handles the holes efficiently.
  3. Sparse Memory: the physical address space contains holes; if memory hot-plug must be supported, sparse memory is the only option.
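In practice, the memory model mainly determines how the kernel translates a page frame number (PFN) into its struct page. The following is a simplified excerpt of that translation, condensed from include/asm-generic/memory_model.h (the discontiguous case is omitted here for brevity):

```c
/* Flat memory: one global mem_map array, plain indexing. */
#if defined(CONFIG_FLATMEM)
#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))

/* Sparse memory with vmemmap: the page array is virtually
 * contiguous, so indexing stays as cheap as the flat model. */
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#define __pfn_to_page(pfn)	(vmemmap + (pfn))

/* Classic sparse memory: find the section first, then index into
 * that section's slice of the page array. */
#elif defined(CONFIG_SPARSEMEM)
#define __pfn_to_page(pfn)					\
({	unsigned long __pfn = (pfn);				\
	struct mem_section *__sec = __pfn_to_section(__pfn);	\
	__section_mem_map_addr(__sec) + __pfn;			\
})
#endif
```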
二、The Three-Level Structure (Node/Zone/Page)
  • The memory management subsystem describes physical memory with a three-level structure: node, zone, and page.
1、Memory Nodes
  • In a NUMA system, memory nodes are divided according to the distance between processors and memory. In a UMA system with discontiguous memory, a memory node denotes a region one level above the zone: nodes are divided according to whether physical addresses are contiguous, and each contiguous block of physical memory forms one memory node. A memory node's layout is described by a pglist_data structure (full definition below; the sketch that follows walks its fields).
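As a sketch of how these fields are used (a hypothetical helper; NODE_DATA(), for_each_online_node() and populated_zone() are standard kernel APIs, and the pglist_data fields appear in the structure below), the following walks every online node and prints its zones:

```c
#include <linux/mmzone.h>	/* NODE_DATA(), struct zone, populated_zone() */
#include <linux/nodemask.h>	/* for_each_online_node() */
#include <linux/printk.h>

static void print_node_zones(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		int i;

		pr_info("node %d: start pfn %lu, present pages %lu\n",
			pgdat->node_id, pgdat->node_start_pfn,
			pgdat->node_present_pages);

		/* nr_zones is one past the highest populated zone index. */
		for (i = 0; i < pgdat->nr_zones; i++) {
			struct zone *z = &pgdat->node_zones[i];

			if (populated_zone(z))
				pr_info("  zone %-8s: start pfn %lu, present %lu\n",
					z->name, z->zone_start_pfn,
					z->present_pages);
		}
	}
}
```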



  • The member node_mem_map points to the page descriptor array; every physical page has one page descriptor. node_mem_map may not point to the first element of the array, because the array size must be aligned to 2^(MAX_ORDER-1), where (MAX_ORDER-1) is the largest order the page allocator can allocate. The corresponding pglist_data definition in the kernel source is as follows:
```c
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];		/* array of memory zones */
	struct zonelist node_zonelists[MAX_ZONELISTS];	/* fallback zone lists */
	int nr_zones;					/* number of zones */
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;	/* page descriptor array (all models except sparse) */
#ifdef CONFIG_PAGE_EXTENSION
	struct page_ext *node_page_ext;	/* extended page attributes */
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
	/*
	 * Must be held any time you expect node_start_pfn,
	 * node_present_pages, node_spanned_pages or nr_zones to stay constant.
	 *
	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
	 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
	 *
	 * Nests above zone->lock and zone->span_seqlock
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;		/* starting physical page frame number */
	unsigned long node_present_pages;	/* total physical pages (excluding holes) */
	unsigned long node_spanned_pages;	/* total physical pages (including holes) */
	int node_id;				/* node identifier */
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;
	struct task_struct *kswapd;	/* Protected by mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_classzone_idx;
	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
	int kcompactd_max_order;
	enum zone_type kcompactd_classzone_idx;
	wait_queue_head_t kcompactd_wait;
	struct task_struct *kcompactd;
#endif
	/*
	 * This is a per-node reserve of pages that are not available
	 * to userspace allocations.
	 */
	unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
	/*
	 * zone reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long min_unmapped_pages;
	unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */

	/* Write-intensive fields used by page reclaim */
	ZONE_PADDING(_pad1_)
	spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
#endif

	/* Fields commonly accessed by the page reclaim scanner */
	struct lruvec lruvec;

	unsigned long flags;

	ZONE_PADDING(_pad2_)

	/* Per-node vmstats */
	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
	atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
```

2、Memory Zones

A memory node is divided into memory zones. The kernel defines the zone types as follows:

```c
enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific.
	 *
	 * Some examples
	 *
	 * Architecture		Limit
	 * ---------------------------
	 * parisc, ia64, sparc	<4G
	 * s390, powerpc	<2G
	 * arm			Various
	 * alpha		Unlimited or 0-16MB.
	 *
	 * i386, x86_64 and multiple other arches
	 *			<16M.
	 */
	ZONE_DMA,	/* DMA zone: for direct memory access */
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32 bit devices that
	 * can only do DMA areas below 4G.
	 */
	ZONE_DMA32,	/* DMA32 zone on 64-bit systems */
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,	/* normal zone: the linearly mapped region (ARM needs
			 * page tables for this mapping, MIPS does not) */
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,	/* high memory zone; the kernel virtual address space
			 * on 64-bit systems is so large that this zone is no
			 * longer needed there */
#endif
	ZONE_MOVABLE,	/* movable zone: a pseudo zone used to prevent memory
			 * fragmentation */
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,	/* device zone: supports persistent memory (memory
			 * regions added by hot-plug) */
#endif
	__MAX_NR_ZONES
};
```

Each memory zone is described by a zone structure; the corresponding kernel source is as follows:

```c
struct zone {
	/* Read-mostly fields */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long _watermark[NR_WMARK];	/* watermarks used by the page allocator */
	unsigned long watermark_boost;

	unsigned long nr_reserved_highatomic;

	/*
	 * We don't know if the memory that we're going to allocate will be
	 * freeable or/and it will be released eventually, so to avoid totally
	 * wasting several GB of ram we must reserve some of the lower zone
	 * memory (otherwise we risk to run OOM on the lower zones despite
	 * there being tons of freeable ram on the higher zones). This array is
	 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
	 * changes.
	 */
	long lowmem_reserve[MAX_NR_ZONES];	/* pages this zone keeps back and
						 * must not lend to higher zone types */
#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data *zone_pgdat;		/* the node's pglist_data instance */
	struct per_cpu_pageset __percpu *pageset;	/* per-CPU page sets */
#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long zone_start_pfn;

	/*
	 * spanned_pages is the total pages spanned by the zone, including
	 * holes, which is calculated as:
	 *	spanned_pages = zone_end_pfn - zone_start_pfn;
	 *
	 * present_pages is physical pages existing within the zone, which
	 * is calculated as:
	 *	present_pages = spanned_pages - absent_pages(pages in holes);
	 *
	 * managed_pages is present pages managed by the buddy system, which
	 * is calculated as (reserved_pages includes pages allocated by the
	 * bootmem allocator):
	 *	managed_pages = present_pages - reserved_pages;
	 *
	 * So present_pages may be used by memory hotplug or memory power
	 * management logic to figure out unmanaged pages by checking
	 * (present_pages - managed_pages). And managed_pages should be used
	 * by page allocator and vm scanner to calculate all kinds of watermarks
	 * and thresholds.
	 *
	 * Locking rules:
	 *
	 * zone_start_pfn and spanned_pages are protected by span_seqlock.
	 * It is a seqlock because it has to be read outside of zone->lock,
	 * and it is done in the main allocator path. But, it is written
	 * quite infrequently.
	 *
	 * The span_seq lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock. It's good to
	 * give them a chance of being in the same cacheline.
	 *
	 * Write access to present_pages at runtime should be protected by
	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
	 * present_pages should get_online_mems() to get a stable value.
	 */
	atomic_long_t managed_pages;	/* physical pages managed by the buddy allocator */
	unsigned long spanned_pages;	/* total pages spanned by this zone, including holes */
	unsigned long present_pages;	/* physical pages present in this zone, excluding holes */

	const char *name;		/* zone name */
#ifdef CONFIG_MEMORY_ISOLATION
	/*
	 * Number of isolated pageblock. It is used to solve incorrect
	 * freepage counting problem due to racy retrieving migratetype
	 * of pageblock. Protected by zone->lock.
	 */
	unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t span_seqlock;
#endif
	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)

	/* free areas of different orders */
	struct free_area free_area[MAX_ORDER];

	/* zone flags, see below */
	unsigned long flags;

	/* Primarily protects free_area */
	spinlock_t lock;

	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)

	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached
	 */
	unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* pfn where compaction free scanner should start */
	unsigned long compact_cached_free_pfn;
	/* pfn where async and sync compaction migration scanner should start */
	unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 */
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
	int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* Set to true when the PG_migrate_skip bits should be cleared */
	bool compact_blockskip_flush;
#endif

	bool contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
```

3、Physical Pages

Each physical page is described by a page structure, called the page descriptor; the member node_mem_map of a node's pglist_data instance points to the array of page descriptors for all physical pages in that node. The kernel function page_to_nid returns the ID of the memory node a physical page belongs to:

```c
#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
	struct page *p = (struct page *)page;

	return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
```

page_zonenum returns the zone type a physical page belongs to:

```c
static inline enum zone_type page_zonenum(const struct page *page)
{
	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
```
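To tie the three levels together, here is a short hypothetical sketch that allocates one page and asks which node and zone it landed in (alloc_page(), page_to_nid(), page_zone() and page_zonenum() are standard kernel APIs):

```c
#include <linux/gfp.h>		/* alloc_page(), __free_page() */
#include <linux/mm.h>		/* page_to_nid(), page_zone(), page_zonenum() */
#include <linux/printk.h>

static void page_origin_demo(void)
{
	struct page *page = alloc_page(GFP_KERNEL);

	if (!page)
		return;

	/*
	 * page->flags encodes both the node and the zone, so both
	 * lookups are just shift-and-mask operations on the flags word.
	 */
	pr_info("page: node %d, zone %s (type %d)\n",
		page_to_nid(page),
		page_zone(page)->name,
		page_zonenum(page));

	__free_page(page);
}
```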

三、Bootmem/Memblock Allocators
  • The Linux kernel needs to allocate memory while it is initializing, so it provides a temporary boot memory allocator. Once the page allocator and the block allocator have been initialized, the remaining free physical pages are handed over to the page allocator and the boot allocator is discarded.
1、The data structure used by the bootmem allocator is as follows:
```c
/*
 * The member node_bootmem_map points to a bitmap in which every
 * physical page has one bit; when a page is allocated, its bit is
 * set to 1.
 */
struct bootmem_data;
```

  • Older kernel versions have the bootmem_data structure; newer versions keep only the memblock structure.
2、The data structures used by the memblock allocator are as follows:
```c
/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
struct memblock_type {
	unsigned long cnt;		/* number of regions */
	unsigned long max;		/* size of the allocated array */
	phys_addr_t total_size;		/* total size of all regions */
	struct memblock_region *regions;	/* array of memblock regions */
	char *name;			/* symbolic name of this memblock type */
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 * @physmem: all physical memory
 */
struct memblock {
	bool bottom_up;			/* allocation direction: true means allocate
					 * bottom-up from low addresses, false means
					 * top-down from high addresses */
	phys_addr_t current_limit;	/* highest physical address that may be allocated */
	struct memblock_type memory;	/* memory type (allocated plus free memory) */
	struct memblock_type reserved;	/* reserved type */
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	struct memblock_type physmem;	/* physical memory type */
#endif
};
```

The difference between the physical memory type and the memory type: the memory type is a subset of the physical memory type. A kernel parameter given at boot can limit the amount of usable memory; the memory type then covers only the usable ranges, while the physical memory type always covers all memory ranges.

```c
/**
 * enum memblock_flags - definition of memory region attributes
 * @MEMBLOCK_NONE: no special request
 * @MEMBLOCK_HOTPLUG: hotpluggable region
 * @MEMBLOCK_MIRROR: mirrored region
 * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
 */
enum memblock_flags {
	MEMBLOCK_NONE		= 0x0,	/* region with no special requirements */
	MEMBLOCK_HOTPLUG	= 0x1,	/* hot-pluggable region */
	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
	MEMBLOCK_NOMAP		= 0x4,	/* don't add to the kernel direct mapping
					 * (the linear mapping region) */
};

/**
 * struct memblock_region - represents a memory region
 * @base: physical address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id
 */
struct memblock_region {
	phys_addr_t base;		/* starting physical address */
	phys_addr_t size;		/* length */
	enum memblock_flags flags;	/* flags */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	int nid;			/* node number */
#endif
};
```

The ARM64 kernel initializes the memblock allocator in two steps:
a. Parse the /memory node of the flattened device tree and add all physical memory ranges to memblock.memory.
b. Initialize memblock in the kernel function arm64_memblock_init:

```c
void __init arm64_memblock_init(void)
{
	const s64 linear_region_size = -(s64)PAGE_OFFSET;

	/* Handle linux,usable-memory-range property */
	fdt_enforce_memory_region();

	/* Remove memory above our supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/*
	 * Ensure that the linear region takes up exactly half of the kernel
	 * virtual address space. This way, we can distinguish a linear address
	 * from a kernel/module/vmalloc address by testing a single bit.
	 */
	BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));

	/*
	 * Select a suitable value for the base of physical memory.
	 */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if it results in the
		 * initrd to become inaccessible via the linear mapping.
		 * Otherwise, this is a no-op
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_size);

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			initrd_start = 0;
		} else {
			memblock_remove(base, size); /* clear MEMBLOCK_ flags */
			memblock_add(base, size);
			memblock_reserve(base, size);
		}
	}

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		extern u16 memstart_offset_seed;
		u64 range = linear_region_size -
			    (memblock_end_of_DRAM() - memblock_start_of_DRAM());

		/*
		 * If the size of the linear region exceeds, by a sufficient
		 * margin, the size of the region that the available physical
		 * memory spans, randomize the linear region as well.
		 */
		if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
			range /= ARM64_MEMSTART_ALIGN;
			memstart_addr -= ARM64_MEMSTART_ALIGN *
					 ((range * memstart_offset_seed) >> 16);
		}
	}

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	memblock_reserve(__pa_symbol(_text), _end - _text);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();

	/* 4GB maximum for 32-bit only capable devices */
	if (IS_ENABLED(CONFIG_ZONE_DMA32))
		arm64_dma_phys_limit = max_zone_dma_phys();
	else
		arm64_dma_phys_limit = PHYS_MASK + 1;

	reserve_crashkernel();

	reserve_elfcorehdr();

	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

	dma_contiguous_reserve(arm64_dma_phys_limit);
}
```
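For completeness, here is a minimal sketch of typical early-boot memblock usage (the function name and buffer size are illustrative; memblock_alloc() is the standard entry point and returns zeroed, direct-mapped memory):

```c
#include <linux/memblock.h>
#include <linux/kernel.h>	/* panic() */
#include <linux/sizes.h>	/* SZ_4K */

void __init early_scratch_init(void)
{
	/*
	 * Allocate one zeroed, 4 KiB-aligned buffer from memblock.
	 * Internally this marks the chosen range as reserved, i.e. it
	 * moves the range out of the free portion of memblock.memory.
	 */
	void *buf = memblock_alloc(SZ_4K, SZ_4K);

	if (!buf)
		panic("early_scratch_init: out of boot memory\n");

	/* ... use buf during early boot; it stays reserved until freed ... */
}
```

Once the page allocator is up, memory still held by memblock.reserved stays out of the buddy system, which is exactly how the kernel protects its own image and early allocations.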

