Fwd: ext2/3 create large filesystem takes too much time; solutions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



---------- Forwarded message ----------
From: Pavel Mironchik <[email protected]>
Date: Sep 12, 2006 2:07 PM
Subject: ext2/3 create large filesystem takes too much time; solutions
To: [email protected]


Hi,

Ext2/3 erases (zeroes) the inode tables when a new filesystem is created.
This is a very long operation when the target filesystem volume is larger than
2TB. Other filesystems are not affected by such a huge delay at filesystem
creation time. My concern was to improve the design of ext3 to decrease the time
consumed by creating large ext3 volumes on storage servers.
In general, to solve the problem, we should defer the job of cleaning inodes to
the kernel. In e2fsprogs there is the LAZY_BG option, but it only avoids
erasing the inode tables.

I see several solutions to this problem:
1) Add special bitmaps to the fs header (inode group descriptors?).
By looking at those bitmaps the kernel could determine whether an inode has not
been cleaned yet, and initialize that inode properly.
2) Add a special identifier to each inode. If superblock id != inode id
-> the inode is dirty
and should be cleaned by the kernel; the superblock id is generated at
filesystem creation time.

I chose the second (much easier - just a few lines of code)
and implemented a patch for e2fsprogs and the kernel's ext3. It is just a proof of concept.
With the help of this patch I could create multi-terabyte volumes quickly.
Of course, this patch will break compatibility with existing filesystems.
The more correct approach is to choose the first way and not break compatibility.

With this mail I just want to check whether there is any interest in this problem
from the community.
I would like the future ext4 filesystem to be fast to create,
and I would appreciate any thoughts or remarks.
---------------------------
Pavel S. Mironchik




--
-----------------------------------
Pavel S. Mironchik
diff -ruN upload/fs/ext3/inode.c l1/linux-2.6.17/fs/ext3/inode.c
--- upload/fs/ext3/inode.c	2006-08-24 00:16:33 +0300
+++ linux-2.6.17/fs/ext3/inode.c	2006-09-06 16:22:17 +0300
@@ -2576,6 +2576,17 @@
 		inode->i_flags |= S_DIRSYNC;
 }
 
+void ext3_check_uuid_inode(struct ext3_inode *rinode, struct inode *vinode) 
+{
+  struct ext3_sb_info* sbi = (struct ext3_sb_info*)(vinode->i_sb)->s_fs_info;
+  int length = sizeof(u8)*16;
+
+  if(memcmp(rinode->i_uuid,sbi->s_uuid, length)) {
+    memset(rinode,0,sizeof(struct ext3_inode));
+    memcpy(rinode->i_uuid,sbi->s_uuid, length);
+  }
+}
+
 void ext3_read_inode(struct inode * inode)
 {
 	struct ext3_iloc iloc;
@@ -2594,6 +2605,9 @@
 		goto bad_inode;
 	bh = iloc.bh;
 	raw_inode = ext3_raw_inode(&iloc);
+	
+	ext3_check_uuid_inode(raw_inode,inode);
+
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -2713,6 +2727,8 @@
 	return;
 }
 
+
+
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
@@ -2725,6 +2741,7 @@
 				struct ext3_iloc *iloc)
 {
 	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
+	struct ext3_sb_info* sbi = (struct ext3_sb_info*)(inode->i_sb)->s_fs_info;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;
@@ -2773,6 +2790,8 @@
 	raw_inode->i_fsize = ei->i_frag_size;
 #endif
 	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+	memcpy(raw_inode->i_uuid, sbi->s_uuid, sizeof(u8)*16);
+
 	if (!S_ISREG(inode->i_mode)) {
 		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
 	} else {
@@ -2832,7 +2851,7 @@
 }
 
 /*
- * ext3_write_inode()
+ * ext3_write_inode
  *
  * We are called from a few places:
  *
diff -ruN upload/fs/ext3/super.c l1/linux-2.6.17/fs/ext3/super.c
--- upload/fs/ext3/super.c	2006-08-24 00:16:33 +0300
+++ linux-2.6.17/fs/ext3/super.c	2006-09-06 16:03:25 +0300
@@ -1464,6 +1425,8 @@
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
 
+	memcpy(sbi->s_uuid,es->s_uuid, sizeof(u8)*16);
+
 	set_opt(sbi->s_mount_opt, RESERVATION);
 
 	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
diff -ruN upload/include/linux/ext3_fs.h l1/linux-2.6.17/include/linux/ext3_fs.h
--- upload/include/linux/ext3_fs.h	2006-08-24 00:16:33 +0300
+++ l1/linux-2.6.17/include/linux/ext3_fs.h	2006-09-06 16:20:53 +0300
@@ -284,6 +284,7 @@
 	__le32	i_file_acl;	/* File ACL */
 	__le32	i_dir_acl;	/* Directory ACL */
 	__le32	i_faddr;	/* Fragment address */
+        __le16   i_uuid[8];
 	union {
 		struct {
 			__u8	l_i_frag;	/* Fragment number */
@@ -531,7 +532,7 @@
 #define EXT3_CURRENT_REV	EXT3_GOOD_OLD_REV
 #define EXT3_MAX_SUPP_REV	EXT3_DYNAMIC_REV
 
-#define EXT3_GOOD_OLD_INODE_SIZE 128
+#define EXT3_GOOD_OLD_INODE_SIZE (128+(8*16))
 
 /*
  * Feature set definitions
diff -ruN upload/include/linux/ext3_fs_sb.h l1/linux-2.6.17/include/linux/ext3_fs_sb.h
--- upload/include/linux/ext3_fs_sb.h	2006-08-24 00:16:33 +0300
+++ l1/linux-2.6.17/include/linux/ext3_fs_sb.h	2006-09-06 15:59:35 +0300
@@ -78,6 +78,7 @@
 	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
+        u8 s_uuid[16]; 
 };
 
 #endif	/* _LINUX_EXT3_FS_SB */
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 1cfbc35..7008826 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -296,7 +296,8 @@ struct ext2_inode {
 	__u32	i_generation;	/* File version (for NFS) */
 	__u32	i_file_acl;	/* File ACL */
 	__u32	i_dir_acl;	/* Directory ACL */
-	__u32	i_faddr;	/* Fragment address */
+	__u32	i_faddr;	/* Fragment address */        
+        __u8    s_uuid[16];
 	union {
 		struct {
 			__u8	l_i_frag;	/* Fragment number */
@@ -354,6 +355,7 @@ struct ext2_inode_large {
 	__u32	i_file_acl;	/* File ACL */
 	__u32	i_dir_acl;	/* Directory ACL */
 	__u32	i_faddr;	/* Fragment address */
+        __u8    s_uuid[16];
 	union {
 		struct {
 			__u8	l_i_frag;	/* Fragment number */
@@ -547,7 +549,7 @@ #define EXT2_DYNAMIC_REV	1	/* V2 format 
 #define EXT2_CURRENT_REV	EXT2_GOOD_OLD_REV
 #define EXT2_MAX_SUPP_REV	EXT2_DYNAMIC_REV
 
-#define EXT2_GOOD_OLD_INODE_SIZE 128
+#define EXT2_GOOD_OLD_INODE_SIZE (128+(16*8))
 
 /*
  * Journal inode backup types
diff --git a/lib/ext2fs/inode.c b/lib/ext2fs/inode.c
index 8d528b4..af3b87f 100644
--- a/lib/ext2fs/inode.c
+++ b/lib/ext2fs/inode.c
@@ -381,6 +381,8 @@ static inline int is_empty_scan(ext2_ino
 }
 #endif
 
+errcode_t ext2fs_check_inode_uuid(ext2_filsys fs, struct ext2_inode* inode);
+
 errcode_t ext2fs_get_next_inode_full(ext2_inode_scan scan, ext2_ino_t *ino,
 				     struct ext2_inode *inode, int bufsize)
 {
@@ -452,7 +454,7 @@ #endif
 		       scan->inode_size - extra_bytes);
 		scan->ptr += scan->inode_size - extra_bytes;
 		scan->bytes_left -= scan->inode_size - extra_bytes;
-
+		/* CHECK */
 #ifdef EXT2FS_ENABLE_SWAPFS
 		if ((scan->fs->flags & EXT2_FLAG_SWAP_BYTES) ||
 		    (scan->fs->flags & EXT2_FLAG_SWAP_BYTES_READ))
@@ -463,10 +465,12 @@ #ifdef EXT2FS_ENABLE_SWAPFS
 		else
 #endif
 			*inode = *((struct ext2_inode *) scan->temp_buffer);
+		ext2fs_check_inode_uuid(scan->fs,inode);
 		if (scan->scan_flags & EXT2_SF_BAD_EXTRA_BYTES)
 			retval = EXT2_ET_BAD_BLOCK_IN_INODE_TABLE;
 		scan->scan_flags &= ~EXT2_SF_BAD_EXTRA_BYTES;
 	} else {
+	  /* CHECK */
 #ifdef EXT2FS_ENABLE_SWAPFS
 		if ((scan->fs->flags & EXT2_FLAG_SWAP_BYTES) ||
 		    (scan->fs->flags & EXT2_FLAG_SWAP_BYTES_READ))
@@ -477,6 +481,7 @@ #ifdef EXT2FS_ENABLE_SWAPFS
 		else
 #endif
 			memcpy(inode, scan->ptr, bufsize);
+		ext2fs_check_inode_uuid(scan->fs,inode);
 		scan->ptr += scan->inode_size;
 		scan->bytes_left -= scan->inode_size;
 		if (scan->scan_flags & EXT2_SF_BAD_INODE_BLK)
@@ -496,6 +501,17 @@ errcode_t ext2fs_get_next_inode(ext2_ino
 						sizeof(struct ext2_inode));
 }
 
+errcode_t ext2fs_update_inode_uuid(ext2_filsys fs, struct ext2_inode* inode);
+
+errcode_t ext2fs_check_inode_uuid(ext2_filsys fs, struct ext2_inode* inode)  
+{
+  if(memcmp(inode->s_uuid,fs->super->s_uuid,sizeof(__u8)*16)) {
+    memset(inode, 0, sizeof(struct ext2_inode));
+    ext2fs_update_inode_uuid(fs, inode);
+  }
+  return 0;
+}
+
 /*
  * Functions to read and write a single inode.
  */
@@ -589,6 +605,9 @@ #ifdef EXT2FS_ENABLE_SWAPFS
 				       0, length);
 #endif
 
+
+	ext2fs_check_inode_uuid(fs,inode);
+	
 	/* Update the inode cache */
 	fs->icache->cache_last = (fs->icache->cache_last + 1) %
 		fs->icache->cache_size;
@@ -605,6 +624,18 @@ errcode_t ext2fs_read_inode(ext2_filsys 
 					sizeof(struct ext2_inode));
 }
 
+
+/* 
+   NODE CONNECTIVITY!!!
+ */
+errcode_t ext2fs_update_inode_uuid(ext2_filsys fs, struct ext2_inode* inode) 
+{
+  
+         memcpy(inode->s_uuid,fs->super->s_uuid, sizeof(__u8) * 16);
+         return 0;
+}
+
+
 errcode_t ext2fs_write_inode_full(ext2_filsys fs, ext2_ino_t ino,
 				  struct ext2_inode * inode, int bufsize)
 {
@@ -615,6 +646,8 @@ errcode_t ext2fs_write_inode_full(ext2_f
 	int clen, i, length;
 
 	EXT2_CHECK_MAGIC(fs, EXT2_ET_MAGIC_EXT2FS_FILSYS);
+	
+	ext2fs_update_inode_uuid(fs,inode);
 
 	/* Check to see if user provided an override function */
 	if (fs->write_inode) {
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index d008e15..e786938 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -398,7 +398,7 @@ static errcode_t zero_blocks(ext2_filsys
 
 static void write_inode_tables(ext2_filsys fs)
 {
-	errcode_t	retval;
+	errcode_t	retval = 0;
 	blk_t		blk;
 	dgrp_t		i;
 	int		num;
@@ -423,7 +423,7 @@ static void write_inode_tables(ext2_fils
 
 		if (!(lazy_flag &&
 		      (fs->group_desc[i].bg_flags & EXT2_BG_INODE_UNINIT))) {
-			retval = zero_blocks(fs, blk, num, 0, &blk, &num);
+		//	retval = zero_blocks(fs, blk, num, 0, &blk, &num);
 			if (retval) {
 				fprintf(stderr, _("\nCould not write %d "
 				"blocks in inode table starting at %u: %s\n"),

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux