Difference between revisions of "Encoding Relational Tables in NetCDF"

From Earth Science Information Partners (ESIP)
Line 168: Line 168:
 
* '''There can be any combination of the two, enabling the best of both worlds whatever your data will look like.'''
 
* '''There can be any combination of the two, enabling the best of both worlds whatever your data will look like.'''
  
== More Real Examples ==
+
== More Real World Examples ==
  
CIRA/VIEWS
+
These examples are not full real-life examples of real live databases, but rather examples how to encode a typical schema. The center is usually the data table, and the data record refers to the auxliary metadata tables. Typically cases are a star or a snowflake schema.
  
NILU/EMEP
+
=== CIRA/VIEWS ===
 +
 
 +
All the data is stored at the center table AirData3. The
 +
 
 +
 
 +
=== NILU/EMEP ===
 +
 
 +
 
 +
 
 +
create table EB_STATION (
 +
EB_STATION_CODE                char(7)                          not null  ,
 +
EB_NAME                        varchar(255)                        null  ,
 +
EB_LONGITUDE                    float(16)                            null  ,
 +
EB_LATITUDE                    float(16)                            null  ,
 +
EB_ALTITUDE_ASL                float(16)                            null  ,
 +
PRIMARY KEY CLUSTERED ( EB_STATION_CODE )  on 'default'
 +
)
 +
 
 +
 
 +
 
 +
 
 +
 
 +
 
 +
create table DS_DATA_SETS (
 +
DS_SETKEY                      numeric(10,0)                    identity  ,
 +
EB_STATION_CODE                char(7)                              null  ,
 +
ER_REGIME_CODE                  char(3)                          not null  ,
 +
EM_MATRIX_NAME                  varchar(40)                      not null  ,
 +
EC_COMP_NAME                    varchar(100)                    not null  ,
 +
DS_STARTDATE                    datetime                        not null  ,
 +
DS_ENDDATE                      datetime                            null  ,
 +
PRIMARY KEY CLUSTERED ( DS_SETKEY )  on 'default',
 +
CONSTRAINT TR_RES CHECK  (TR_RESOLUTION_CODE in
 +
('1s','2s','30s','1mn','5mn','10mn','15mn','20mn','30mn','1h','2h','6h','1d','2d','1w','2w','4w','1mo','3mo','1y')))
 +
create nonclustered index REF_DATASETS_STATIONS_FK
 +
on ebas_input.dbo.DS_DATA_SETS(EB_STATION_CODE)
 +
 
 +
 
 +
 
 +
 
 +
create table DP_DATA_SETS_PROJECT (
 +
DS_SETKEY                      numeric(10,0)                    not null  ,
 +
PR_ACRONYM                      varchar(40)                      not null  ,
 +
CONSTRAINT PK_DP_DATA_SETS_PROJECT PRIMARY KEY CLUSTERED ( DS_SETKEY, PR_ACRONYM )  on 'default'
 +
)
 +
create nonclustered index REF_DATA_SETS_PROJECT_DS_DATA_
 +
on ebas_input.dbo.DP_DATA_SETS_PROJECT(PR_ACRONYM)
 +
create nonclustered index REF_DATA_SETS_PROJECT_PR_PROJE
 +
on ebas_input.dbo.DP_DATA_SETS_PROJECT(DS_SETKEY)
 +
 
 +
 
 +
 
 +
 
 +
 
 +
create table A1_TIME (
 +
DS_SETKEY                      numeric(10,0)                    not null  ,
 +
A1_STARTTIME                    datetime                        not null  ,
 +
A1_ENDTIME                      datetime                        not null  ,
 +
A1_VALUE                        float(16)                            null  ,
 +
A1_NUMFLAG                      decimal(10,9)                        null  ,
 +
A1_CODES                        char(4)                              null  ,
 +
PRIMARY KEY CLUSTERED ( DS_SETKEY, A1_STARTTIME )  on 'default'
 +
)
 +
create nonclustered index REF_A1TIME_DATASET_FK
 +
on ebas_input.dbo.A1_TIME(DS_SETKEY)
 +
 
 +
 
 +
 
 +
alter table ebas_input.dbo.DS_DATA_SETS
 +
add constraint DS_DATA_SE_1632008845 FOREIGN KEY (EB_STATION_CODE) REFERENCES ebas_input.dbo.EB_STATION(EB_STATION_CODE)
 +
go
 +
 
 +
alter table ebas_input.dbo.DP_DATA_SETS_PROJECT
 +
add constraint FK_DP_DATA__REF_DATAS_DS_DATA_ FOREIGN KEY (DS_SETKEY) REFERENCES ebas_input.dbo.DS_DATA_SETS(DS_SETKEY)
 +
go
 +
 
 +
alter table ebas_input.dbo.A1_TIME
 +
add constraint A1_TIME_1904009814 FOREIGN KEY (DS_SETKEY) REFERENCES ebas_input.dbo.DS_DATA_SETS(DS_SETKEY)
 +
go

Revision as of 10:34, August 31, 2011

Back to Questions and Comments about CF-1.6 Station Data Convention

Rational Behind this Proposal

The WCS service has potential to grow beyond regular, gridded coverages. Coverage types that need to be described are for example:

  • station point data
  • station trajectory data
  • lidar data
  • moving station data, like aeroplane or ship

With a relational system, it is easy to design a schema that captures all the data and the metadata.

The netCDF data format was designed to contain mainly gridded multidimensional data. The format does not have a special support for tables, which sounds like a severe restriction, but it's not.

  • It's possible to agree about a simple way to encode tables in a netCDF file.
    • Tables with multiple fields.
    • Proper primary keys for tables.
    • Foreign keys for referential integrity.
    • Common relationships: many-to-many, 1-to-many, 1-to-1.
  • It's possible to be efficient, the encoding does not have to be one size fits all.
    • Sparse tables can be encoded row-by-row manner.
    • Dense tables can be encoded as multidimensional hypercubes.
    • Mix and match the two above at will.
  • It's possible to be elegant.
    • No mysterious indirections.
    • No complex offset calculations.
    • A reader-writer library can take care of following the rules, letting a programmer to operate on higher level.

Encoding Table Columns

Table is a collection of columns. These are indicated with table_name attribute.

Every table needs a primary key. It is indicated with primary_key = "T" attribute.

Example: A station table with three columns: station_code, lat and lon. There are ten rows with indexes 0..9

   dimensions:
       station = 10 ;  // station table dimension
       station_code_length = 4 ; // artificial dimension for station code, with max four character length.
   variables:
       char station_code(station, station_code_length) ;
           station_code:table_name = "station";
           station_code:primary_key = "T";
       float lon(station) ; 
           lon:table_name = "station";
       float lat(station) ; 
           lat:table_name = "station";

The dimension name station can be anything. It does not have to equal the table name station, but should be for readability. Same for station_code_length, it can be called anything.

A table can have multiple fields as a primary key.

Encoding Multiple Tables

Adding the second table is similar. Since new data is usually added by time, it can be made the unlimited dimension.

   dimensions:
       station = 10 ;  // station table dimension
       station_code_length = 4 ; // artificial dimension for station code, with max four character length.
       time = UNLIMITED ;
   variables:
       char station_code(station, station_code_length) ;
           station_code:table_name = "station";
           station_code:primary_key = "T";
       float lon(station) ; 
           lon:table_name = "station";
       float lat(station) ; 
           lat:table_name = "station";
       int time(time) ;
           time:table_name = "time";
           time:primary_key = "T";

Encoding Relations

A typical station data instance is a relation (station_code, time, temperature, humidity). The pair (station_code, time) is the primary key; data is recorded at a certain station, at a certain time. Temperature and humidity are the recorded parameters.

Encoding this into netCDF can be done in two ways, using either variable or a dimension.

Encoding Relation as a Record Array

So far we have the two unrelated tables for our two dimensions: location and time. Now let's add humidity and temperature, which refers to both of the dimensions:

   dimensions:
       station = 10 ;  // station table dimension
       station_code_length = 4 ; // artificial dimension for station code, with max four character length.
       data = UNLIMITED // each row in the data table. 
       time = 2000 ; we can't have two unlimited dimensions. 
   variables:
       char station_code(station, station_code_length) ;
           station_code:table_name = "station";
           station_code:primary_key = "T";
       float lon(station) ; 
           lon:table_name = "station";
       float lat(station) ; 
           lat:table_name = "station";
       int time(time) ;
           time:table_name = "time";
           time:primary_key = "T";
       int data_station_code(data) ;
           data_station_code:table_name = "data";
           data_station_code:foreign_key = "station_code";
           data_station_code:primary_key = "T";
       int data_time(data) ;
           data_time:table_name = "data";
           data_time:foreign_key = "time";
           data_time:primary_key = "T";
       float temperature(data) ; 
           temperature:table_name = "data";
       float humidity(data) ; 
           humidity:table_name = "data";

The variable data_station_code is marked foreign_key = "station_code", which means, that the variable is an index to the station_code variable. Similar for time. Both are marked primary_key = "T" which means, that the combination of (data_station_code, data_time) is unique.

   To read the full table (station_code, lat, lon, time, temperature, humidity), row by row:
   for idx in range of data
       read station:
           read data_station_code(idx)
           read variables station_code, lat and lon at that index
       read time:
           read data_time(idx)
           read variable time at that index
       read temperature(idx)
       read humidity(idx)

This is an space-efficient encoding for sparse tables.

Encoding Relation as a Multidimensional Array

This is identical to CF-1.6 encoding. Both variables are 2-dimensional cubes.

   dimensions:
       station = 10 ;  // station table dimension
       station_code_length = 4 ; // artificial dimension for station code, with max four character length.
       time = UNLIMITED ; // we can again add one time slice at a time
   variables:
       char station_code(station, station_code_length) ;
           station_code:table_name = "station";
           station_code:primary_key = "T";
       float lon(station) ; 
           lon:table_name = "station";
       float lat(station) ; 
           lat:table_name = "station";
       int time(time) ;
           time:table_name = "time";
           time:primary_key = "T";
       float temperature(time,station) ; 
           temperature:table_name = "data";
       float humidity(time,station) ; 
           humidity:table_name = "data";

This declares two arrays, one for temperature and one for humidity. Both are two-dimensional. The dimensions time and station refer to the time and station variables. They implicitly became part of the primary key.

   To find temperature at "ACAD" in "2004-17-22":
       Find index for "ACAD" from the "station_code" array. Let's say it's 2.
       Find index for "2004-17-22" from the time array. Lets say it's 317.
       Read temperature(317,2)

This is a very space-efficient encoding, if most of the stations have data most of the time.

Summa Summarum

  • A typical schema of any relational database can be encoded.
  • If the data is sparse, foreign keys can be expressed as variables.
  • If the data is dense, foreign keys can be expressed as dimensions, resulting in vastly smaller files than flat CSV would be.
  • There can be any combination of the two, enabling the best of both worlds whatever your data will look like.

More Real World Examples

These examples are not full real-life examples of real live databases, but rather examples how to encode a typical schema. The center is usually the data table, and the data record refers to the auxliary metadata tables. Typically cases are a star or a snowflake schema.

CIRA/VIEWS

All the data is stored at the center table AirData3. The


NILU/EMEP

create table EB_STATION ( EB_STATION_CODE char(7) not null , EB_NAME varchar(255) null , EB_LONGITUDE float(16) null , EB_LATITUDE float(16) null , EB_ALTITUDE_ASL float(16) null ,

PRIMARY KEY CLUSTERED ( EB_STATION_CODE )  on 'default' 

)




create table DS_DATA_SETS ( DS_SETKEY numeric(10,0) identity , EB_STATION_CODE char(7) null , ER_REGIME_CODE char(3) not null , EM_MATRIX_NAME varchar(40) not null , EC_COMP_NAME varchar(100) not null , DS_STARTDATE datetime not null , DS_ENDDATE datetime null ,

PRIMARY KEY CLUSTERED ( DS_SETKEY )  on 'default',

CONSTRAINT TR_RES CHECK (TR_RESOLUTION_CODE in ('1s','2s','30s','1mn','5mn','10mn','15mn','20mn','30mn','1h','2h','6h','1d','2d','1w','2w','4w','1mo','3mo','1y'))) create nonclustered index REF_DATASETS_STATIONS_FK on ebas_input.dbo.DS_DATA_SETS(EB_STATION_CODE)



create table DP_DATA_SETS_PROJECT ( DS_SETKEY numeric(10,0) not null , PR_ACRONYM varchar(40) not null , CONSTRAINT PK_DP_DATA_SETS_PROJECT PRIMARY KEY CLUSTERED ( DS_SETKEY, PR_ACRONYM ) on 'default' ) create nonclustered index REF_DATA_SETS_PROJECT_DS_DATA_ on ebas_input.dbo.DP_DATA_SETS_PROJECT(PR_ACRONYM) create nonclustered index REF_DATA_SETS_PROJECT_PR_PROJE on ebas_input.dbo.DP_DATA_SETS_PROJECT(DS_SETKEY)



create table A1_TIME ( DS_SETKEY numeric(10,0) not null , A1_STARTTIME datetime not null , A1_ENDTIME datetime not null , A1_VALUE float(16) null , A1_NUMFLAG decimal(10,9) null , A1_CODES char(4) null ,

PRIMARY KEY CLUSTERED ( DS_SETKEY, A1_STARTTIME )  on 'default' 

) create nonclustered index REF_A1TIME_DATASET_FK on ebas_input.dbo.A1_TIME(DS_SETKEY)


alter table ebas_input.dbo.DS_DATA_SETS add constraint DS_DATA_SE_1632008845 FOREIGN KEY (EB_STATION_CODE) REFERENCES ebas_input.dbo.EB_STATION(EB_STATION_CODE) go

alter table ebas_input.dbo.DP_DATA_SETS_PROJECT add constraint FK_DP_DATA__REF_DATAS_DS_DATA_ FOREIGN KEY (DS_SETKEY) REFERENCES ebas_input.dbo.DS_DATA_SETS(DS_SETKEY) go

alter table ebas_input.dbo.A1_TIME add constraint A1_TIME_1904009814 FOREIGN KEY (DS_SETKEY) REFERENCES ebas_input.dbo.DS_DATA_SETS(DS_SETKEY) go