SPARQLthon22/AssemblyReports

提供:TogoWiki

移動: 案内, 検索

NCBI Assembly_Reportsのメタデータを調査する

  • GENOME_REPORTSメタデータでは真核生物の配列Accessionを取得できないため、ASSEMBLY_REPORTSの内容を確認する
    • 2014年7月8日版 assembly_summary_refseq は18262件。そのうちversion_statusが "latest" のものは17454件
    • 個別のAll/*.assembly.txtファイルに配列アクセッション、配列ラベル、配列タイプが記述されているため、RDFを試作した
    • Sequence Ontologyのアサインメントに関連するメタデータ等を確認した

メタデータ

  • assembly_level
    • "Gapless Chromosome" 2739
    • "Chromosome" 513
    • "Chromosome with gaps" 187
    • "Contig" 9355
    • "Scaffold" 5468
  • genome_rep
    • "Full" 18188
    • "Partial" 74
  • refseq_category
    • "reference-genome" 91
    • "representative-genome" 3433
    • "na" 14738
  • release_type
    • "Patch" 10
    • "Major" 18252
  • version_status
    • "latest" 17454
    • "replaced" 808
  • relationship
    • "=" 7611212
    • "" 1
    • "<>" 1584971
  • release_type
    • "Patch" 10
    • "Major" 18252
  • sequence_role
    • "assembled-molecule" 10019
    • "novel-patch" 478
    • "" 1
    • "alt-scaffold" 8444
    • "unplaced-scaffold" 8811124
    • "fix-patch" 656
    • "pseudo-scaffold" 15
    • "unlocalized-scaffold" 365447

サンプルRDF

# Sample RDF (Turtle) for NCBI assembly metadata.
#
# Each assembly is written as one anonymous (blank) node that carries the
# assembly-level properties, followed by one nested blank node per sequence
# attached through the asm:sequence predicate.
# Accessions are given twice: once as a plain literal (e.g. asm:genbank_accession)
# and once as an identifiers.org IRI (e.g. asm:genbank).

@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
# NOTE(review): obo: is declared but not used in this sample -- presumably
# reserved for Sequence Ontology terms; confirm before removing.
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix asm: <http://www.ncbi.nlm.nih.gov/assembly/> .

# Assembly GCF_000009725.1 (Synechocystis sp. PCC 6803): five sequence records.
[
       asm:assembly_id "GCF_000009725.1" ;
       asm:bioproject_accession        "PRJNA57659" ;
       asm:bioproject  <http://identifiers.org/bioproject/PRJNA57659> ;
       asm:biosample_accession "na" ;
       asm:wgs_master  "na" ;
       asm:refseq_category     "representative-genome" ;
       asm:tax_id      "1148" ;
       asm:taxon       <http://identifiers.org/taxonomy/1148> ;
       asm:species_taxid       "1148" ;
       asm:organism_name       "Synechocystis sp. PCC 6803" ;
       asm:infraspecific_name  "strain=PCC 6803" ;
       asm:isolate     "na" ;
       asm:version_status      "latest" ;
       asm:assembly_level      "Gapless Chromosome" ;
       asm:release_type        "Major" ;
       asm:genome_rep  "Full" ;
       asm:release_date        "2004/05/11" ;
       asm:asm_name    "ASM972v1" ;
       asm:submitter   "Kazusa" ;
       asm:gbrs_paired_asm     "GCA_000009725.1" ;
       asm:paired_asm_comp     "identical" ;
       rdfs:seeAlso    <http://www.ncbi.nlm.nih.gov/assembly/GCF_000009725.1> ;
       asm:sequence    [
               asm:sequence_name       "ANONYMOUS" ;
               asm:sequence_role       "assembled-molecule" ;
               asm:assigned_molecule   "na" ;
               asm:assigned_molecule_location_type     "Chromosome" ;
               asm:genbank_accession   "BA000022.2" ;
               asm:genbank     <http://identifiers.org/insdc/BA000022.2> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NC_000911.1" ;
               asm:refseq      <http://identifiers.org/refseq/NC_000911.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
       asm:sequence    [
               asm:sequence_name       "pSYSA" ;
               asm:sequence_role       "assembled-molecule" ;
               asm:assigned_molecule   "pSYSA" ;
               asm:assigned_molecule_location_type     "Plasmid" ;
               asm:genbank_accession   "AP004311.1" ;
               asm:genbank     <http://identifiers.org/insdc/AP004311.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NC_005230.1" ;
               asm:refseq      <http://identifiers.org/refseq/NC_005230.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
       asm:sequence    [
               asm:sequence_name       "pSYSG" ;
               asm:sequence_role       "assembled-molecule" ;
               asm:assigned_molecule   "pSYSG" ;
               asm:assigned_molecule_location_type     "Plasmid" ;
               asm:genbank_accession   "AP004312.1" ;
               asm:genbank     <http://identifiers.org/insdc/AP004312.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NC_005231.1" ;
               asm:refseq      <http://identifiers.org/refseq/NC_005231.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
       asm:sequence    [
               asm:sequence_name       "pSYSM" ;
               asm:sequence_role       "assembled-molecule" ;
               asm:assigned_molecule   "pSYSM" ;
               asm:assigned_molecule_location_type     "Plasmid" ;
               asm:genbank_accession   "AP004310.1" ;
               asm:genbank     <http://identifiers.org/insdc/AP004310.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NC_005229.1" ;
               asm:refseq      <http://identifiers.org/refseq/NC_005229.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
       asm:sequence    [
               asm:sequence_name       "pSYSX" ;
               asm:sequence_role       "assembled-molecule" ;
               asm:assigned_molecule   "pSYSX" ;
               asm:assigned_molecule_location_type     "Plasmid" ;
               asm:genbank_accession   "AP006585.1" ;
               asm:genbank     <http://identifiers.org/insdc/AP006585.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NC_005232.1" ;
               asm:refseq      <http://identifiers.org/refseq/NC_005232.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
]
.
# Assembly GCF_000002595.1 (Chlamydomonas reinhardtii): only the first three
# sequence records are shown.
[
       asm:assembly_id "GCF_000002595.1" ;
       asm:bioproject_accession        "PRJNA21061" ;
       asm:bioproject  <http://identifiers.org/bioproject/PRJNA21061> ;
       asm:biosample_accession "na" ;
       asm:wgs_master  "ABCN00000000.1" ;
       asm:refseq_category     "representative-genome" ;
       asm:tax_id      "3055" ;
       asm:taxon       <http://identifiers.org/taxonomy/3055> ;
       asm:species_taxid       "3055" ;
       asm:organism_name       "Chlamydomonas reinhardtii" ;
       asm:infraspecific_name  "strain=CC-503 cw92 mt+" ;
       asm:isolate     "na" ;
       asm:version_status      "latest" ;
       asm:assembly_level      "Scaffold" ;
       asm:release_type        "Major" ;
       asm:genome_rep  "Full" ;
       asm:release_date        "2007/10/15" ;
       asm:asm_name    "v3.0" ;
       asm:submitter   "DOE Joint Genome Institute" ;
       asm:gbrs_paired_asm     "GCA_000002595.2" ;
       asm:paired_asm_comp     "different" ;
       rdfs:seeAlso    <http://www.ncbi.nlm.nih.gov/assembly/GCF_000002595.1> ;
       asm:sequence    [
               asm:sequence_name       "CHLREscaffold_1" ;
               asm:sequence_role       "unplaced-scaffold" ;
               asm:assigned_molecule   "na" ;
               asm:assigned_molecule_location_type     "na" ;
               asm:genbank_accession   "DS496108.1" ;
               asm:genbank     <http://identifiers.org/insdc/DS496108.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NW_001843471.1" ;
               asm:refseq      <http://identifiers.org/refseq/NW_001843471.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
       asm:sequence    [
               asm:sequence_name       "CHLREscaffold_2" ;
               asm:sequence_role       "unplaced-scaffold" ;
               asm:assigned_molecule   "na" ;
               asm:assigned_molecule_location_type     "na" ;
               asm:genbank_accession   "DS496109.1" ;
               asm:genbank     <http://identifiers.org/insdc/DS496109.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NW_001843642.1" ;
               asm:refseq      <http://identifiers.org/refseq/NW_001843642.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
       asm:sequence    [
               asm:sequence_name       "CHLREscaffold_3" ;
               asm:sequence_role       "unplaced-scaffold" ;
               asm:assigned_molecule   "na" ;
               asm:assigned_molecule_location_type     "na" ;
               asm:genbank_accession   "DS496110.1" ;
               asm:genbank     <http://identifiers.org/insdc/DS496110.1> ;
               asm:relationship        "=" ;
               asm:refseq_accession    "NW_001843733.1" ;
               asm:refseq      <http://identifiers.org/refseq/NW_001843733.1> ;
               asm:assembly_unit       "Primary Assembly" ] ;
# ... (remaining sequence records omitted)
]
.


リンク