Hacking PostgreSQL. Физическое представление данных

Embed Size (px)

Citation preview

Postgres Professional presentation

Click to edit the title text formatClick to edit Master title style

www.postgrespro.ru

Click to edit the title text formatClick to edit Master title style

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline LevelSeventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth level

Fifth level

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline LevelSeventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth level

Fifth level

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline Level

Seventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth levelFifth level

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline Level

Seventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth levelFifth level

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline Level

Seventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth levelFifth level

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline LevelSeventh Outline Level

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline LevelSeventh Outline Level

Click to edit the title text formatClick to edit Master title style

Click to edit the title text formatClick to edit Master title style

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline LevelSeventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth level

Fifth level

Click to edit the title text formatClick to edit Master title style

Click to edit the outline text formatSecond Outline LevelThird Outline LevelFourth Outline LevelFifth Outline LevelSixth Outline LevelSeventh Outline LevelClick to edit Master text stylesSecond levelThird levelFourth level

Fifth level

Click to edit the title text formatClick to edit Master title style

www.postgrespro.ru

Hacking PostgreSQL24.03.2016

OID

PGDATA

Storage manager

Heap

Forks

Page

: ?:src/include/access/stratnum.h

Relation

src/include/catalog/pg_class.h
#define RELKIND_RELATION 'r' /* ordinary table */
#define RELKIND_INDEX 'i' /* secondary index */
#define RELKIND_SEQUENCE 'S' /* sequence object */
#define RELKIND_TOASTVALUE 't' /* for out-of-line values */
#define RELKIND_VIEW 'v' /* view */
#define RELKIND_COMPOSITE_TYPE 'c' /* composite type */
#define RELKIND_FOREIGN_TABLE 'f' /* foreign table */
#define RELKIND_MATVIEW 'm' /* materialized view */

, , , . . pg_class. , - . relation OID.

OID

src/include/postgres_ext.h

/* Object ID is a fundamental type in Postgres. */
typedef unsigned int Oid;

#define InvalidOid ((Oid) 0)

src/include/access/transam.h

#define FirstBootstrapObjectId 10000
#define FirstNormalObjectId 16384

OID object identifier, , , PostgreSQL. , , , . enum OID. OID unsigned int, . OID 0 OID. 10 OID. , , src/include/catalog 10 16383 ( ) initdb. OID 16384 OID . Unsigned int , - . wraparound OID, 0 FirstNormalObjectId .

OID

src/backend/catalog/catalog.c
Oid GetNewOid (Relation relation)

Oid GetNewRelFileNode (Oid reltablespace,Relation pg_class,char relpersistence)

src/backend/access/transam/varsup.c
Oid GetNewObjectId()

( ), OID. GetNewOid OID, . GetNewOid pg_database. , . relation, OID, , GetNewRelFileNode. , relfilenode . GetNewObjectId, nextOid OID. ? OID , , .

OID

src/include/catalog/unused_oids
src/include/catalog/duplicate_oids

, pg_proc, OID . unused_oids OID. OID, . duplicate_oids OID. , merge.

SELECT oid, relname FROM pg_class LIMIT 1;

oid | relname ------+-------------- 2619 | pg_statistic

SELECT * FROM pg_attribute WHERE attrelid = 'pg_statistic'::regclass;

SELECT * FROM pg_attribute WHERE attrelid = (SELECT oid FROM pg_class WHERE relname = 'pg_statistic');

SELECT 'pg_statistic'::regclass::oid; oid ------ 2619

SELECT 2619::regclass; regclass -------------- pg_statistic

OID, , select. Select * oid. alias OID. regclass. regclass regproc, regtype , .

OID wraparound?

?

OID?

, ?

OID

[HACKERS] 32bit OID wrap around conceerns

, pg_proc, OID . unused_oids OID. OID, . duplicate_oids OID. , merge.

$PGDATA

PGDATA="/home/anastasia/projects/postgresql_data"ls -CF

base/ pg_multixact/ pg_tblspc/global/ pg_notify/ pg_twophase/pg_clog/ pg_replslot/ PG_VERSIONpg_commit_ts/ pg_serial/ pg_xlog/pg_dynshmem/ pg_snapshots/ postgresql.auto.confpg_hba.conf pg_stat/ postgresql.confpg_ident.conf pg_stat_tmp/ postmaster.optspg_logical/ pg_subtrans/

Postgres ., postgres data. $PGDATA. , . , , . , , , PGDATA., pg_basebackup Debian. , , pg_basebackup - . , postgresql.conf, pgdata ( ), .

$PGDATA/global

ls -CF

1136 1233 2396_vm 2846_vm 40601136_fsm 1260 2397 2847 4060_vm1136_vm 1260_fsm 2671 2964 40611137 1260_vm 2672 2964_vm 60001213 1261 2676 2965 6000_vm1213_fsm 1261_vm 2677 2966 60011213_vm 1262 2694 2966_vm 60021214 1262_fsm 2695 2967 pg_control1214_fsm 1262_vm 2697 3592 pg_filenode.map1214_vm 2396 2698 3592_vm pg_internal.init1232 2396_fsm 2846 3593

global , . :pg_control checkpoint, WAL. pg_global. - , src/include/catalog/pg_control.h

pg_filenode.map pg_class.relfilenode., pg_class . pg_class, relation. pg_filenode.map. src/backend/utils/cache/relmapper.c

pg_internal.init , .

pg_global

postgres=# SELECT oid, spcname FROM pg_tablespace ; oid | spcname ------+------------ 1663 | pg_default 1664 | pg_global

postgres=# SELECT oid, relfilenode, relname FROM pg_class WHERE reltablespace = 1664 ORDER BY oid;

oid | relfilenode | relname ------+-------------+----------------------------------------- 1136 | 0 | pg_pltemplate 1137 | 0 | pg_pltemplate_name_index 1213 | 0 | pg_tablespace 1214 | 0 | pg_shdepend 1232 | 0 | pg_shdepend_depender_index 1233 | 0 | pg_shdepend_reference_index 1260 | 0 | pg_authid 1261 | 0 | pg_auth_members 1262 | 0 | pg_database 2396 | 0 | pg_shdescription 2397 | 0 | pg_shdescription_o_c_index 2671 | 0 | pg_database_datname_index 2672 | 0 | pg_database_oid_index

, pg_global. pg_class , tablespace global. . . , relfilenode . , pg_filenode.map, .

pg_global (2)

postgres=# SELECT oid, relfilenode, relname FROM pg_class WHERE reltablespace = 1664 ORDER BY oid;

oid | relfilenode | relname ------+-------------+----------------------------------------- 2676 | 0 | pg_authid_rolname_index 2677 | 0 | pg_authid_oid_index 2694 | 0 | pg_auth_members_role_member_index 2695 | 0 | pg_auth_members_member_role_index 2697 | 0 | pg_tablespace_oid_index 2698 | 0 | pg_tablespace_spcname_index 2846 | 0 | pg_toast_2396 2847 | 0 | pg_toast_2396_index 2964 | 0 | pg_db_role_setting 2965 | 0 | pg_db_role_setting_databaseid_rol_index 2966 | 0 | pg_toast_2964 2967 | 0 | pg_toast_2964_index 3592 | 0 | pg_shseclabel 3593 | 0 | pg_shseclabel_object_index 4060 | 0 | pg_toast_3592 4061 | 0 | pg_toast_3592_index 6000 | 0 | pg_replication_origin 6001 | 0 | pg_replication_origin_roiident_index 6002 | 0 | pg_replication_origin_roname_index

.

Tablespaces

src/include/catalog/pg_tablespace.h

pg_tblspc/

view pg_tablespace

!

PostgreSQL. . -, , . , .

Even though located outside the main PostgreSQL data directory, tablespaces are an integral part of the database cluster and cannot be treated as an autonomous collection of data files. They are dependent on metadata contained in the main data directory, and therefore cannot be attached to a different database cluster or backed up individually. Similarly, if you lose a tablespace (file deletion, disk failure, etc), the database cluster might become unreadable or unable to start. Placing a tablespace on a temporary file system like a RAM disk risks the reliability of the entire cluster.

$PGDATA/base

ls -CF1/ 12423/ 12424/ 16501/ pgsql_tmp/contrib/oid2name

$oid2nameAll databases: Oid Database Name Tablespace---------------------------------- 16501 db pg_default 12424 postgres pg_default 12423 template0 pg_default 1 template1 pg_default

. base. . . .

oid2name.... . , .

pg_relation_filepath

src/backend/utils/adt/dbsize.cdb=# \df pg*relation* List of functions Schema | Name | Result data type | Argument data types | Type ------------+--------------------------+------------------+---------------------+-------- pg_catalog | pg_filenode_relation | regclass | oid, oid | normal pg_catalog | pg_relation_filenode | oid | regclass | normal pg_catalog | pg_relation_filepath | text | regclass | normal pg_catalog | pg_relation_is_updatable | integer | regclass, boolean | normal pg_catalog | pg_relation_size | bigint | regclass | normal pg_catalog | pg_relation_size | bigint | regclass, text | normal pg_catalog | pg_total_relation_size | bigint | regclass | normal

db=# SELECT pg_relation_filepath('tbl'); pg_relation_filepath ---------------------- base/16497/16498

db=# SELECT pg_filenode_relation(0, 16498); pg_filenode_relation ---------------------- tbl

oid2name. relation .

$PGDATA/base/16501

db=# CREATE TABLE tbl (a int, b int);

db=# SELECT oid, relname, relfilenode FROM pg_class WHERE relname='tbl'; oid | relname | relfilenode -------+---------+------------- 16502 | tbl | 16502

~/projects/postgresql_data/base/16501$ ls 16502*16502

~/projects/postgresql_data/base/16501$ wc -c 16502/* */

db=# INSERT INTO tbl VALUES (1,1);~/projects/postgresql_data/base/16501$ wc -c 16502/* */

. . oid2name. . . , .

pg_buffercache

db=# CREATE EXTENSION pg_buffercache;db=# SELECT * FROM pg_buffercache WHERE relfilenode ='tbl'::regclass ;-[ RECORD 1 ]----+------bufferid | 454relfilenode | 16502reltablespace | 1663reldatabase | 16501relforknumber | 0relblocknumber | 0isdirty | tusagecount | 2pinning_backends | 0

db=# CHECKPOINT;db=# SELECT * FROM pg_buffercache WHERE relfilenode ='tbl'::regclass ;-[ RECORD 1 ]----+------bufferid | 454relfilenode | 16502reltablespace | 1663reldatabase | 16501relforknumber | 0relblocknumber | 0isdirty | fusagecount | 2pinning_backends | 0

. . oid2name. . . , .

pageinspect

~/projects/postgresql_data/base/16501$ wc -c 16502/* !*/db=# CREATE EXTENSION pageinspect;db=# select * from heap_page_items(get_raw_page('tbl',0));-[ RECORD 1 ]-------------------lp | 1lp_off | 8160lp_flags | 1lp_len | 32t_xmin | 720t_xmax | 0t_field3 | 0t_ctid | (0,1)t_infomask2 | 2t_infomask | 2048t_hoff | 24t_bits | t_oid | t_data | \x0100000001000000

. . oid2name. . . , .

oid vs relfilenode

db=# SELECT oid, relname,relfilenode FROM pg_class WHERE relname='tbl';

oid | relname | relfilenode -------+---------+------------- 16502 | tbl | 16502

db=# TRUNCATE tbl ;

db=# SELECT oid, relname,relfilenode FROM pg_class WHERE relname='tbl';

oid | relname | relfilenode -------+---------+------------- 16502 | tbl | 16531

, truncate, cluster, relfilenode , OID .

RelFileNode

src/include/storage/relfilenode.h

typedef struct RelFileNode
{
    Oid spcNode; /* tablespace */
    Oid dbNode;  /* database */
    Oid relNode; /* relation */
} RelFileNode;

typedef struct RelFileNodeBackend
{
    RelFileNode node;
    BackendId backend;
} RelFileNodeBackend;

relfilenode, , relation. relation , .

Storage manager

Storage manager

src/backend/storage/smgr/README
src/backend/storage/smgr/smgr.c
src/backend/storage/smgr/md.c

Berkeley Postgres storage managers. Postgres. storage manager - (magnetic disk). , , . storage manager. , , , - . Berkeley relation , smgr. . - smgr, tablespace.

smgr.c smgr, SmgrRelationmd.c API smgr API .

() I/O

src/include/storage/bufpage.hsrc/backend/access/nbtree/nbtpage.c

() -

src/include/storage/block.h
typedef uint32 BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)
#define MaxBlockNumber ((BlockNumber) 0xFFFFFFFE)

typedef struct BlockIdData
{
    uint16 bi_hi;
    uint16 bi_lo;
} BlockIdData;

/* block identifier */typedef BlockIdData *BlockId;

* each data file (heap or index) is divided into postgres disk blocks * (which may be thought of as the unit of i/o -- a postgres buffer * contains exactly one disk block). the blocks are numbered * sequentially, 0 to 0xFFFFFFFE. * * InvalidBlockNumber is the same thing as P_NEW in buf.h. * * the access methods, the buffer manager and the storage manager are * more or less the only pieces of code that should be accessing disk * blocks directly.

* this is a storage type for BlockNumber. in other words, this type * is used for on-disk structures (e.g., in HeapTupleData) whereas * BlockNumber is the type on which calculations are performed (e.g., * in access method code). * * there doesn't appear to be any reason to have separate types except * for the fact that BlockIds can be SHORTALIGN'd (and therefore any * structures that contains them, such as ItemPointerData, can also be * SHORTALIGN'd). this is an important consideration for reducing the * space requirements of the line pointer (ItemIdData) array on each * page and the header of each heap or index tuple, so it doesn't seem * wise to change this without good reason.

Relation
+ (forks)

Forks

src/include/common/relpath.h
typedef enum ForkNumber
{
    InvalidForkNumber = -1,
    MAIN_FORKNUM = 0,
    FSM_FORKNUM,
    VISIBILITYMAP_FORKNUM,
    INIT_FORKNUM
} ForkNumber;

src/common/relpath.c
const char *const forkNames[] = {
    "main", /* MAIN_FORKNUM */
    "fsm",  /* FSM_FORKNUM */
    "vm",   /* VISIBILITYMAP_FORKNUM */
    "init"  /* INIT_FORKNUM */
};

4 .

MAIN

src/include/catalog/storage.h
src/backend/catalog/storage.c
src/include/storage/buf.h
src/include/storage/bufmgr.h

void RelationCreateStorage(RelFileNode rnode, char relpersistence)

Buffer ReadBuffer (Relation reln, BlockNumber blockNum)

relation. , , . . , storage . , drop table ,

Since most code wants to access the main fork, a shortcut version of ReadBuffer that accesses MAIN_FORKNUM is provided in the buffer manager for convenience.

FreeSpaceMap

FreeSpaceMap

src/backend/storage/freespace/README
src/backend/storage/freespace/freespace.c

BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded)

FSM 8.4.FSM . 1/256. freespace BLCKSZ/256.: , root.

VisibilityMap

src/backend/access/heap/visibilitymap.c

2 :All visible

All frozen

_init

http://www.postgresql.org/docs/devel/static/storage-init.html

unlogged Relations

src/include/storage/bufpage.h

* +----------------+---------------------------------+ * | PageHeaderData | linp1 linp2 linp3 ... | * +-----------+----+---------------------------------+ * | ... linpN | | * +-----------+--------------------------------------+ * | ^ pd_lower | * | | * | v pd_upper | * +-------------+------------------------------------+ * | | tupleN ... | * +-------------+------------------+-----------------+ * |... tuple3 tuple2 tuple1 | "special space" | * +--------------------------------+-----------------+

- ) )

PageHeaderData

src/include/storage/bufpage.h

typedef struct PageHeaderData
{
    /* XXX LSN is member of *any* block, not only page-organized ones */
    PageXLogRecPtr pd_lsn;      /* LSN: next byte after last byte of xlog
                                 * record for last change to this page */
    uint16 pd_checksum;         /* checksum */
    uint16 pd_flags;            /* flag bits, see below */
    LocationIndex pd_lower;     /* offset to start of free space */
    LocationIndex pd_upper;     /* offset to end of free space */
    LocationIndex pd_special;   /* offset to start of special space */
    uint16 pd_pagesize_version;
    TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
    ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */

} PageHeaderData;

PageAddItem

src/include/storage/bufpage.hsrc/backend/storage/page/bufpage.c

OffsetNumber PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, bool overwrite, bool is_heap)

. .

OID wraparound?

OID?

.

pageinspect

gevel pageinspect

, / ( ). pageinspect.

default

http://www.interdb.jp/pg/pgsql01.html

http://www.slideshare.net/FedericoCampoli/10-things-postgresql

https://momjian.us/main/presentations/internals.html

https://wiki.postgresql.org/images/8/81/FSM_and_Visibility_Map.pdf

,
Feature Freeze

https://commitfest.postgresql.org/9/433/

https://commitfest.postgresql.org/9/494/

31.03 07.04

! ?

Hacking [email protected]