The Amalgam Schema and Data Integration Test Suite

 

Renée J. Miller and Daniel Fisla and Mary Huang and David Kalmuk and Fei Ku and Vivian Lee

Goals:

The goals of this research project were to develop a schema integration benchmark. A set of databases with similar content but different schemas were designed by a set of students working independently. The benchmark will be used to test and evaluate data integration methods. The schemas and data are publicly available. However, we do ask that you send us email to retrieve the data so we know which research groups are using the benchmark.

Methodology. Four separate students were asked to do the following tasks.

To receive a tarred source of all the schemas and databases, please email miller@cs.toronto.edu.

To help determine if this benchmark may be of use in your project, we have included below the four schemas (together with the relational DDL statements needed to create the schemas). We will happily provide the data files and DBMS load statements (currently written for DB2) to any research project.

To reference this benchmark, please use the following citation:

@unpublished{Mil+01,
           author = {Ren\'{e}e J. Miller and Daniel Fisla and Mary Huang
                     and David Kymlicka and Fei Ku and Vivian Lee},
           title = {{The Amalgam Schema and Data Integration Test Suite}},
           note = {www.cs.toronto.edu/~miller/amalgam},
           year = 2001,
           }

ER Diagram of the Schemas together with the SQL-DDL.

The individual schemas are named s1, s2, s3 and s4. All schemas were created in a master SQL database named "amalgam".

 

- please give time to load -

 

Schema S1 The original data source for s1 was a BIBTEX file was downloaded from the site: http://liinwww.ira.uka.de/bibliography/Database/Wiederhold/index.html

1. ER Diagram submitted by Fei Ku:

create table s1.InProceedings \
  (inprocID char(35) not null, title char(254), bktitle char(250), year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(inprocID))

create table s1.Article \
  (articleID char(35) not null, title char(254), journal char(150), year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(articleID))

create table s1.TechReport \
  (techID char(35) not null, title char(254), inst char(200), year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(techID))

create table s1.Book \
  (bookID char(35) not null, title char(254), publisher char(200), year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(bookID))

create table s1.InCollection \
  (collID char(35) not null, title char(254), bktitle char(250), year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(collID))

create table s1.Misc \
  (miscID char(35) not null, title char(254), howpub char(200), \
   confloc char(100),  year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(miscID))

create table s1.Manual \
  (manID char(35) not null, title char(254), org char(200), year int, \
   month char(8), pages char(12), vol int, num int, loc char(150), \
   class char(150), note char(254), annote varchar(2800), \
   primary key(manID))

create table s1.Author \
   (AuthID int not null, name char(80) not null, \
    primary key (AuthID))

--RELATIONSHIP TABLES

create table s1.InprocPublished \
   (inprocID char(35) not null, AuthID int not null, \
    primary key(inprocID, AuthID), \
    foreign key (inprocID) references s1.InProceedings \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

create table s1.ArticlePublished \
   (articleID char(35) not null, AuthID int not null, \
    primary key(articleID, AuthID), \
    foreign key (articleID) references s1.Article \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

create table s1.TechPublished \
   (techID char(35) not null, AuthID int not null, \
    primary key(techID, AuthID), \
    foreign key (techID) references s1.TechReport \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

create table s1.BookPublished \
   (bookID char(35) not null, AuthID int not null, \
    primary key(bookID, AuthID), \
    foreign key (bookID) references s1.Book \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

create table s1.InCollPublished \
   (collID char(35) not null, AuthID int not null, \
    primary key(collID, AuthID), \
    foreign key (collID) references s1.InCollection \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

create table s1.MiscPublished \
   (miscID char(35) not null, AuthID int not null, \
    primary key(miscID, AuthID), \
    foreign key (miscID) references s1.Misc \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

create table s1.ManualPublished \
   (manID char(35) not null, AuthID int not null, \
    primary key (manID, AuthID), \
    foreign key (manID) references s1.Manual \
    on delete CASCADE, foreign key (AuthID) \
    references s1.Author on delete CASCADE)

2. ER Diagram submitted by Vivian Lee:

Schema S2 The original data source was the object-oriented bibliograph downloaded from the site: http://iamwww.unibe.ch/~scg/Resources/Bib/scg.bib

create table s2.allBibs ( citKey char(20) not null, primary key (citKey) )

 

create table s2.citForm \

            ( citKey char(20) not null, form  char(20), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.authors \

            ( citKey char(20) not null, autNm char(100) not null, \

              primary key     (citKey, autNm), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.editors \

            ( citKey char(20) not null, \

              edNm              char(100) not null, primary key    (citKey, edNm), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.titles \

            ( citKey char(20) not null, title  char(200), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.schools \

            ( citKey char(20) not null, schoolNm  char(100), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.institutions \

            ( citKey char(20) not null, institNm  char(100), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.addresses \

            ( citKey char(20) not null, address  char(100), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.types \

            ( citKey char(20) not null, type  char(100), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.ISBN \

            ( citKey char(20) not null, isbnNum  char(20), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.notes \

            ( citKey char(20) not null, note  char(200) not null, \

              primary key     (citKey, note), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.volumes \

            ( citKey char(20) not null, volNum  char(50), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.numbers \

            ( citKey             char(20) not null, num  char(50), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.months \

            ( citKey char(20) not null, mon  char(20), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.years \

            ( citKey             char(20) not null, yr  char(20), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.pages \

            ( citKey char(20) not null, pgRange  char(50), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.abstracts \

            ( citKey char(20) not null, txt  varchar(3500), \

              primary key     (citKey), \

              foreign key      (citKey) references s2.allBibs      on delete cascade )

 

create table s2.publisher \

            ( pubNm  char(100), pubID  integer not null, \

              primary key     (pubID) )

 

create table s2.citPublisher \

            ( citKey char(20) not null, pubID  integer not null, \

              primary key     (citKey, pubID), \

              foreign key      (citKey) references s2.allBibs      on delete cascade, \

              foreign key      (pubID) references s2.publisher on delete cascade )

 

create table s2.journal \

            ( jrnlNm char(200), jrnlID integer not null, \

              primary key (jrnlID) )

 

create table s2.citJournal \

            ( citKey char(20) not null, jrnlID    integer not null, \

              primary key     (citKey, jrnlID), \

              foreign key      (citKey) references s2.allBibs      on delete cascade, \

              foreign key      (jrnlID) references s2.journal        on delete cascade )

 

create table s2.series \

            ( seriesNm        char(50), seriesID  integer not null, \

              primary key     (seriesID) )

 

create table s2.citSeries \

            ( citKey char(20) not null, seriesID  integer not null, \

              primary key     (citKey, seriesID), \

              foreign key      (citKey) references s2.allBibs      on delete cascade, \

              foreign key      (seriesID) references s2.series on delete cascade )

 

create table s2.booktitle \

            ( bkTitleNm        char(100), bktitleID  integer not null, \

              primary key     (bktitleID) )

 

create table s2.citBkTitle \

            ( citKey char(20) not null, bktitleID  integer not null, \

              primary key     (citKey, bktitleID), \

              foreign key      (citKey) references s2.allBibs      on delete cascade, \

              foreign key      (bktitleID) references s2.booktitle on delete cascade )

 

create table s2.keyWord \

            ( word   char(50), keyWdID  integer not null, \

              primary key     (keyWdID) )

 

create table s2.citKeyWd \

            ( citKey char(20) not null, keyWdID  integer not null, \

              primary key     (citKey, keyWdID), \

              foreign key      (citKey) references s2.allBibs      on delete cascade, \

              foreign key      (keyWdID) references s2.keyWord on delete cascade )

 

3. ER Diagram submitted by Mary Huang:

Schema S3 The original data source was downloaded from Information and Computation web site: http://theory.lcs.mit.edu/~iandc/homepage.html.

create table s3.article ( \

      articleID varchar(30) not null, \

      title varchar(150) not null, \

      volume int not null, \

      number varchar(20)not null, \

      pages varchar(30) not null, \

      month varchar(100) not null, \

      year int not null, \

      refkey varchar(50), \

      note varchar(150), \

      remarks varchar(400), \

      references varchar(2000), \

      xxxreferences varchar(600), \

      fullxxxreferences varchar(400), \

      oldkey varchar(50), \

      abstract varchar(3000), \

        preliminary varchar(100), \

      primary key (articleID)) IN CSC494TABLESPACE

 

create table s3.author ( \

        authorID int not null, \

        name varchar(40) not null, \  

        primary key (authorID))

 

 

create table s3.unpublished ( \

        unpubID varchar(30) not null, \

        title varchar(150) not null, \

        refkey varchar(20), \

        note varchar(50), \

        preliminary varchar(100), \

        references varchar(2000), \

      primary key(unpubID))

 

create table s3.articleAuthor ( \

      articleID varchar(30) not null, \

      authorID int not null, \

      primary key (articleID, authorID), \

      foreign key (articleID) references s3.article \

                                on delete cascade, \

      foreign key (authorID) references s3.author \

                               on delete restrict)

 

create table s3.unpubAuthor ( \

      unpubID varchar(30) not null, \

      authorID int not null, \

      primary key (unpubID, authorID), \

      foreign key (unpubID) references s3.unpublished \

                              on delete cascade, \

      foreign key (authorID) references s3.author \

                               on delete restrict)

 

 

 

4. ER Diagram submitted by Daniel Fisla:

 

create table s4.author \

( AID INTEGER NOT NULL, Name VARCHAR(50) NOT NULL UNIQUE, \

  Affiliations VARCHAR(200), \

  PRIMARY KEY(AID) )

 

create table s4.descriptor \

( DID INTEGER NOT NULL, Descriptor VARCHAR(50) NOT NULL UNIQUE, \

  PRIMARY KEY(DID) )

 

create table s4.location \

( LID INTEGER NOT NULL , CountryPub VARCHAR(50) NOT NULL, \

  CountryOrigin VARCHAR(50) NOT NULL, \

  UNIQUE(CountryPub,CountryOrigin), \

  PRIMARY KEY(LID) )

 

CREATE TABLE s4.publication \

( PID INTEGER NOT NULL, Title VARCHAR(500) NOT NULL, \

  TitleExt VARCHAR(100), Abstract VARCHAR(3000) NOT NULL, \

  AbstractInd VARCHAR(5) NOT NULL, Language VARCHAR(50) NOT NULL, \

  Journal VARCHAR(255), JournalAnn VARCHAR(20), ConfInfo VARCHAR(500), \

  Book VARCHAR(500), Category VARCHAR(100) NOT NULL, \

  PRIMARY KEY (PID) )

 

CREATE TABLE s4.record \

( RID INTEGER NOT NULL, Availability VARCHAR(255) NOT NULL, \

  UpdateCode VARCHAR(25) NOT NULL, NumRef VARCHAR(3), \

  ContractNum VARCHAR(100), ISSN VARCHAR(25), \

  ISBN VARCHAR(25), Notes VARCHAR(100), \

  Subfile VARCHAR(10) NOT NULL, Source VARCHAR(255), \

  Series VARCHAR(100), AccessionNum VARCHAR(100) NOT NULL, \

  PRIMARY KEY (RID) )

 

CREATE TABLE s4.Described \

 (PID INTEGER NOT NULL, DID INTEGER NOT NULL, \

  PRIMARY KEY(PID, DID), \

  FOREIGN KEY(PID) REFERENCES s4.publication, \

  FOREIGN KEY(DID) REFERENCES s4.descriptor)

 

CREATE TABLE s4.Located \

 (PID INTEGER NOT NULL, LID INTEGER NOT NULL, \

  PRIMARY KEY(PID, LID), \

  FOREIGN KEY(PID) REFERENCES s4.publication, \

  FOREIGN KEY(LID) REFERENCES s4.location)

 

CREATE TABLE s4.Recorded \

 (PID INTEGER NOT NULL, RID INTEGER NOT NULL, RecordType VARCHAR(100), \

  PRIMARY KEY(PID, RID), \

  FOREIGN KEY(PID) REFERENCES s4.publication, \

  FOREIGN KEY(RID) REFERENCES s4.record)

 

CREATE TABLE s4.Written \

 (AID INTEGER NOT NULL, PID INTEGER NOT NULL, \

  PubYear VARCHAR(100) NOT NULL, Publisher VARCHAR(100), \

  PRIMARY KEY(PID, AID), \

  FOREIGN KEY(PID) REFERENCES s4.publication, \

  FOREIGN KEY(AID) REFERENCES s4.author)