A Process(Thread) in Linux Kernel:

/*
 * Per-task descriptor (excerpt): "..." marks members left out of this
 * excerpt.
 */
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	/* Memory descriptor (address space) used by this task: */
	struct mm_struct		*mm;
	...
	/* Identifier of this task: */
	pid_t				pid;
	...
	/*
	 * Pointers to the (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->real_parent->pid)
	 */
 
	/* Real parent process: */
	struct task_struct __rcu	*real_parent;
 
	/* Recipient of SIGCHLD, wait4() reports: */
	struct task_struct __rcu	*parent;
 
	/*
	 * Children/sibling form the list of natural children:
	 */
	struct list_head		children;
	struct list_head		sibling;
	/* NOTE(review): presumably the leader of this task's thread group - confirm. */
	struct task_struct		*group_leader;
	...
	/* Filesystem information: */
	struct fs_struct		*fs;
 
	/* Open file information: */
	struct files_struct		*files;
	...
	...
	/* Namespaces: */
	struct nsproxy			*nsproxy;
	...
	...
	/* Control Group info protected by css_set_lock: */
	struct css_set __rcu		*cgroups;
	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
	struct list_head		cg_list;
	...
	/* CPU-specific state of this task: */
	struct thread_struct		thread;
};

Memory (mm_struct)

image source

image source

Before v6.1-rc1:

VMA (Virtual Memory Area) 也用紅黑樹來紀錄、追蹤頁面 (page) 的變更:page fault 和 mmap 等操作會頻繁地讀取 VMA 結構,而當存在大量已對應 (mapped) 的區域時,若要尋找某個特定的虛擬記憶體地址,走訪鏈結串列 (linked list) 的成本過高,因此需要一種能提供更有效率搜尋的資料結構,而紅黑樹正可勝任。

Linux 核心的紅黑樹

The vm_area_struct objects are chained into a doubly linked list (mmap, vm_next/vm_prev), and mm_rb (an rb_root) is a red-black tree over the same areas for faster lookup of a particular virtual memory address.

pgd points to the Page Global Directory (PGD), the top-level page table of this address space.

image source

  • start_code、end_code:starting and ending memory address of the code section.
  • start_data、end_data:starting and ending memory address of the data section.
  • start_stack:starting memory address of the stack section.
  • start_brk、brk:starting and ending memory address of the heap section.
  • arg_start、arg_end:starting and ending memory address of the command-line arguments.
  • env_start、env_end:starting and ending memory address of the environment variables.
/*
 * Address-space descriptor as laid out before v6.1-rc1 (excerpt): "..."
 * marks omitted members, and the end of the inner anonymous struct is
 * elided as well.
 */
struct mm_struct {
	struct {
		/* List of the VMAs of this address space. */
		struct vm_area_struct *mmap;
		/* Red-black tree for faster lookup of a virtual address. */
		struct rb_root mm_rb;
		...
		/* Page global directory of this address space. */
		pgd_t * pgd;
		...
		/* Start/end addresses of the code and data sections. */
		unsigned long start_code, end_code, start_data, end_data;
		/* Start/end of the heap (start_brk, brk) and start of the stack. */
		unsigned long start_brk, brk, start_stack;
		/* Start/end addresses of the argument and environment areas. */
		unsigned long arg_start, arg_end, env_start, env_end;
};
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */
 
	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */
 
	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;
	...
	/*
	 * Access permissions of this VMA.
	 * See vmf_insert_mixed_prot() for discussion.
	 */
	pgprot_t vm_page_prot;
	unsigned long vm_flags;		/* Flags, see mm.h. */
 

Since v6.1-rc1:

/*
 * Address-space descriptor as laid out since v6.1-rc1 (excerpt): "..."
 * marks omitted members, and the end of the inner anonymous struct is
 * elided as well.
 */
struct mm_struct {
	struct {
		/*
		 * Maple tree; it stands where the pre-v6.1 layout had the
		 * mmap list and the mm_rb red-black tree.
		 */
		struct maple_tree mm_mt;
		...
		/* Page global directory of this address space. */
		pgd_t * pgd;
		...
		/* Start/end addresses of the code and data sections. */
		unsigned long start_code, end_code, start_data, end_data;
		/* Start/end of the heap (start_brk, brk) and start of the stack. */
		unsigned long start_brk, brk, start_stack;
		/* Start/end addresses of the argument and environment areas. */
		unsigned long arg_start, arg_end, env_start, env_end;
};
/*
 * Root handle of a maple tree: a lock, a flags word and the root pointer.
 *
 * If the tree contains a single entry at index 0, it is usually stored in
 * tree->ma_root.  To optimise for the page cache, an entry which ends in '00',
 * '01' or '11' is stored in the root, but an entry which ends in '10' will be
 * stored in a node.  Bits 3-6 are used to store enum maple_type.
 *
 * The flags are used both to store some immutable information about this tree
 * (set at tree creation time) and dynamic information set under the spinlock.
 *
 * Another use of flags are to indicate global states of the tree.  This is the
 * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in
 * RCU mode.  This mode was added to allow the tree to reuse nodes instead of
 * re-allocating and RCU freeing nodes when there is a single user.
 */
struct maple_tree {
	/* The internal spinlock and the external-lock map share storage. */
	union {
		spinlock_t	ma_lock;
		lockdep_map_p	ma_external_lock;
	};
	/* Immutable and dynamic tree flags, as described above. */
	unsigned int	ma_flags;
	/* Root of the tree; may hold a single entry directly, as described above. */
	void __rcu      *ma_root;
};
 
/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */
 
	union {
		struct {
			/* VMA covers [vm_start; vm_end) addresses within mm */
			unsigned long vm_start;
			unsigned long vm_end;
		};
#ifdef CONFIG_PER_VMA_LOCK
		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
#endif
	};

Per Thread’s Stack

sp holds the thread's saved stack pointer, i.e. the position on the thread's stack from which execution resumes when the thread is switched back in.

struct thread_struct {
	/* Cached TLS descriptors: */
	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
#ifdef CONFIG_X86_32
	unsigned long		sp0;
#endif
	unsigned long		sp;
	...

x86 中的 thread_struct

Process V.S. Thread

如果產生出來的 task_struct 有自己的 mm_struct,稱為 Process,其實本身就是個 Main Thread (CPU scheduling的基本單位)。如果 產生出來的 task_struct 共用別人的 mm_struct,稱為 Thread。

The Linux kernel does not provide any special scheduling semantics or data structures to represent threads. Instead, a thread is merely a process that shares certain resources with other processes.

若一個 task_struct 的 mm_struct 指標 (mm) 為 NULL,代表這個 task_struct 是一個 kernel thread。

Files

image source

一個 process(task_struct) 有以下兩個 field:

/* Filesystem information: */
struct fs_struct		*fs;
 
/* Open file information: */
struct files_struct		*files;

其中,fs_struct 代表這個 process 目前所使用的 file system 的資訊。它的每一個 path 都包含路徑 (dentry) 以及 mount 的相關資訊 (vfsmount);後者記錄 mount 在哪裡 (mnt_root,一個 dentry) 以及被 mount 的 file system 的資訊 (super_block)。

/* Per-process filesystem context (excerpt; "..." marks omitted members). */
struct fs_struct {
	...
	/* Root path and current working directory of the process. */
	struct path root, pwd;
} __randomize_layout;

root path 和目前的 working directory struct fs_struct

/* A filesystem location: a dentry together with the mount it belongs to. */
struct path {
	/* Mount information for this location. */
	struct vfsmount *mnt;
	/* Directory entry naming this location. */
	struct dentry *dentry;
} __randomize_layout;
/*
 * Mount information (excerpt; "..." marks omitted members): where the
 * mounted tree is rooted and which superblock backs it.
 */
struct vfsmount {
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct super_block *mnt_sb;	/* pointer to superblock */
	...
} __randomize_layout;

dentry (directory entry) 代表路徑中的一個項目,並指向對應的 inode (d_inode)。

source

/*
 * Directory entry (excerpt; "..." marks omitted members): links a name
 * to its parent entry, its inode and its superblock.
 */
struct dentry {
	...
	struct dentry *d_parent;	/* parent directory */
	...
	struct inode *d_inode;		/* Where the name belongs to - NULL is
					 * negative */
	...
	/* Operations table for this dentry. */
	const struct dentry_operations *d_op;
	struct super_block *d_sb;	/* The root of the dentry tree */
	...
};
 

super_block is the control block of one specific mounted file system. 可觀察到其具有一些屬性,包括 s_dev (型別為 dev_t),也就是 device 的編號。

/*
 * Control block of one file system (excerpt; "..." marks omitted
 * members).
 */
struct super_block {
	...
	dev_t			s_dev;		/* search index; _not_ kdev_t */
	...
	loff_t			s_maxbytes;	/* Max file size */
	/* Type descriptor of this file system. */
	struct file_system_type	*s_type;
	/* Operations table for this superblock (see struct super_operations). */
	const struct super_operations	*s_op;
	...
};
/*
 * Table of function pointers reached through super_block.s_op (excerpt;
 * "..." marks omitted members).  The members shown all deal with inodes.
 */
struct super_operations {
	/* Inode allocation and teardown hooks. */
   	struct inode *(*alloc_inode)(struct super_block *sb);
	void (*destroy_inode)(struct inode *);
	void (*free_inode)(struct inode *);
 
	/* NOTE(review): presumably the inode dirty/write-back/drop hooks - confirm. */
   	void (*dirty_inode) (struct inode *, int flags);
	int (*write_inode) (struct inode *, struct writeback_control *wbc);
	int (*drop_inode) (struct inode *);
	...
};

inode :inode (index node)是指在許多「類Unix檔案系統」中的一種資料結構,用於描述檔案系統對象(包括檔案、目錄、裝置檔案、socket、管道等)。每個inode儲存了檔案系統對象資料的屬性和磁碟塊位置。檔案系統對象屬性包含了各種元資料(如:最後修改時間) ,也包含使用者群組(owner)和權限資料。

source

/*
 * Index node describing one filesystem object (excerpt; "..." marks
 * omitted members).
 */
struct inode {
	...
	/* Operations table for this inode (see struct inode_operations). */
	const struct inode_operations	*i_op;
	/* Superblock this inode refers to. */
	struct super_block	*i_sb;
	...
};
/*
 * Table of function pointers reached through inode.i_op (excerpt; "..."
 * marks omitted members, and the symlink prototype is cut short in this
 * excerpt).
 */
struct inode_operations {
	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
	int (*permission) (struct mnt_idmap *, struct inode *, int);
	struct posix_acl * (*get_inode_acl)(struct inode *, int, bool);
 
	int (*readlink) (struct dentry *, char __user *,int);
 
	int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,
		       umode_t, bool);
	int (*link) (struct dentry *,struct inode *,struct dentry *);
	int (*unlink) (struct inode *,struct dentry *);
	int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,
	...
};
		

至於一個 process 的 open files,使用 files_struct 表示。

/*
 * Open file table structure
 *
 * Excerpt; "..." marks omitted members.
 */
struct files_struct {
	...
	/* Pointer to the file descriptor table currently in use. */
	struct fdtable __rcu *fdt;
	/* File descriptor table embedded in this structure. */
	struct fdtable fdtab;
	...
	/* Embedded array of NR_OPEN_DEFAULT file pointers. */
	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct files_struct

/* File descriptor table (excerpt; "..." marks omitted members). */
struct fdtable {
	/* Number of entries in the array that fd points to. */
	unsigned int max_fds;
	struct file __rcu **fd;      /* current fd array */
	...
};

fdt 指向此 process 目前使用的 open file table (fdtable, file descriptor table);fdtab 則是內嵌在 files_struct 裡的初始 fdtable,fdt 一開始即指向它。

如果一個 process 打開的 file 數目多於 NR_OPEN_DEFAULT (32 bit machine 上是 32個,64 bit 是 64 個),kernel 就會分配一個新的更大的 fd_array,並將其地址存放在fd 欄位中,而這個 fd_array 所包含的 file 數目存放在 max_fds 欄位。

一個 file 會具有其對應的 inode pointer。

/* An open file (excerpt; "..." marks omitted members). */
struct file {
	...
	struct inode		*f_inode;	/* cached value */
	/* Operations table for this open file (see struct file_operations). */
	const struct file_operations	*f_op;
};
/*
 * Table of function pointers reached through file.f_op (excerpt; "..."
 * marks omitted members).
 */
struct file_operations {
	struct module *owner;
	/* Positioning and data-transfer hooks. */
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	...
	...
	/* Takes the file and the vm_area_struct describing the mapping. */
	int (*mmap) (struct file *, struct vm_area_struct *);
	...
	/* Hooks that receive the inode and/or the file being opened or released. */
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	...
};

Namespaces and “containers”

A task_struct contains the following two fields, which are essential for implementing Linux containers:

/* Namespaces: */
struct nsproxy			*nsproxy;
...
...
/* Control Group info protected by css_set_lock: */
struct css_set __rcu		*cgroups;
/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns.  The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
	/* Number of tasks holding a reference to this nsproxy. */
	refcount_t count;
	struct uts_namespace *uts_ns;
	struct ipc_namespace *ipc_ns;
	struct mnt_namespace *mnt_ns;
	/* PID namespace that children will use (not the task's own). */
	struct pid_namespace *pid_ns_for_children;
	struct net 	     *net_ns;
	struct time_namespace *time_ns;
	/* NOTE(review): presumably the time namespace children will use - confirm. */
	struct time_namespace *time_ns_for_children;
	struct cgroup_namespace *cgroup_ns;
};

see User Namespace 詳解