Working With Sequences

Eric Archer

2017-04-10

There are several functions for working with sequence data in strataG. Each takes either a haploid gtypes object that contains sequences, or data in a format that can be converted to a DNAbin object (from the ape package) or a multidna object (from the apex package).

Leading and trailing N’s can be removed from all sequences like this:

library(ape)
data(dolph.seqs)

i <- sample(1:10, 1)
j <- sample(1:10, 1)
x <- c(rep("n", i), dolph.seqs[[1]], rep("n", j))
x
##   [1] "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "g" "a" "a" "a" "a" "a" "-"
##  [18] "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c" "a" "g" "t" "t" "a" "c"
##  [35] "c" "a" "c" "a" "a" "c" "a" "t" "c" "a" "c" "a" "g" "t" "a" "c" "t"
##  [52] "a" "c" "g" "t" "c" "a" "g" "t" "a" "t" "t" "a" "a" "a" "a" "g" "t"
##  [69] "a" "a" "t" "t" "t" "g" "t" "t" "t" "t" "a" "a" "a" "a" "a" "c" "a"
##  [86] "t" "t" "t" "t" "a" "c" "t" "g" "t" "a" "c" "a" "c" "a" "t" "t" "a"
## [103] "c" "a" "t" "a" "t" "a" "c" "a" "t" "a" "c" "a" "c" "a" "t" "g" "t"
## [120] "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a" "t" "t" "t" "a" "g" "t"
## [137] "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a" "a" "a" "t" "a" "t"
## [154] "t" "c" "a" "t" "a" "c" "a" "t" "a" "c" "a" "t" "g" "c" "t" "a" "t"
## [171] "g" "t" "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t" "t" "c" "a"
## [188] "t" "t" "t" "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g" "a" "t"
## [205] "a" "a" "g" "t" "t" "a" "a" "a" "g" "c" "c" "c" "g" "t" "a" "t" "t"
## [222] "a" "a" "t" "t" "a" "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a"
## [239] "c" "a" "t" "a" "t" "t" "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "c"
## [256] "a" "t" "g" "c" "t" "c" "t" "t" "a" "c" "a" "t" "a" "t" "t" "a" "t"
## [273] "a" "t" "c" "t" "c" "c" "c" "c" "t" "a" "t" "c" "a" "a" "t" "t" "t"
## [290] "c" "a" "c" "c" "t" "c" "c" "a" "t" "t" "a" "t" "a" "c" "c" "c" "t"
## [307] "a" "t" "g" "g" "t" "c" "a" "c" "t" "c" "c" "a" "t" "t" "a" "g" "a"
## [324] "t" "c" "a" "c" "g" "a" "g" "c" "t" "t" "a" "a" "t" "c" "a" "c" "c"
## [341] "a" "t" "g" "c" "c" "g" "c" "g" "t" "g" "a" "a" "a" "c" "c" "a" "g"
## [358] "c" "a" "a" "c" "c" "c" "g" "c" "t" "c" "g" "g" "c" "a" "g" "g" "g"
## [375] "a" "t" "c" "c" "c" "t" "c" "t" "t" "c" "t" "c" "g" "c" "a" "c" "c"
## [392] "g" "g" "g" "c" "c" "c" "a" "t" "a" "t" "c" "t" "c" "g" "t" "g" "g"
## [409] "g" "g" "g" "t" "n" "n" "n" "n"
x.trimmed <- trimNs(as.DNAbin(x))
as.character(as.list(x.trimmed))
## [[1]]
##   [1] "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a"
##  [18] "c" "a" "g" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c" "a" "t" "c" "a"
##  [35] "c" "a" "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g" "t" "a" "t"
##  [52] "t" "a" "a" "a" "a" "g" "t" "a" "a" "t" "t" "t" "g" "t" "t" "t" "t"
##  [69] "a" "a" "a" "a" "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t" "a"
##  [86] "c" "a" "c" "a" "t" "t" "a" "c" "a" "t" "a" "t" "a" "c" "a" "t" "a"
## [103] "c" "a" "c" "a" "t" "g" "t" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t"
## [120] "a" "t" "t" "t" "a" "g" "t" "c" "-" "t" "c" "t" "c" "c" "t" "t" "g"
## [137] "t" "a" "a" "a" "t" "a" "t" "t" "c" "a" "t" "a" "c" "a" "t" "a" "c"
## [154] "a" "t" "g" "c" "t" "a" "t" "g" "t" "a" "t" "t" "a" "t" "t" "g" "t"
## [171] "g" "c" "a" "t" "t" "c" "a" "t" "t" "t" "a" "t" "t" "t" "t" "c" "c"
## [188] "a" "t" "a" "c" "g" "a" "t" "a" "a" "g" "t" "t" "a" "a" "a" "g" "c"
## [205] "c" "c" "g" "t" "a" "t" "t" "a" "a" "t" "t" "a" "t" "c" "a" "t" "t"
## [222] "a" "a" "t" "t" "t" "t" "a" "c" "a" "t" "a" "t" "t" "a" "c" "a" "t"
## [239] "a" "a" "t" "a" "t" "g" "c" "a" "t" "g" "c" "t" "c" "t" "t" "a" "c"
## [256] "a" "t" "a" "t" "t" "a" "t" "a" "t" "c" "t" "c" "c" "c" "c" "t" "a"
## [273] "t" "c" "a" "a" "t" "t" "t" "c" "a" "c" "c" "t" "c" "c" "a" "t" "t"
## [290] "a" "t" "a" "c" "c" "c" "t" "a" "t" "g" "g" "t" "c" "a" "c" "t" "c"
## [307] "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g" "a" "g" "c" "t" "t"
## [324] "a" "a" "t" "c" "a" "c" "c" "a" "t" "g" "c" "c" "g" "c" "g" "t" "g"
## [341] "a" "a" "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g" "c" "t" "c"
## [358] "g" "g" "c" "a" "g" "g" "g" "a" "t" "c" "c" "c" "t" "c" "t" "t" "c"
## [375] "t" "c" "g" "c" "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "a" "t"
## [392] "c" "t" "c" "g" "t" "g" "g" "g" "g" "g" "t"

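Base frequencies for a set of sequences are calculated with the baseFreqs function, which returns both the frequency of each base at every site (site.freqs) and the overall proportion of each base (base.freqs):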

bf <- baseFreqs(dolph.seqs)
bf$site.freqs[, 1:8]
##     1   2   3   4   5   6 7   8
## a   0 126 126 126 126 126 5   0
## c   0   0   0   0   0   0 0   0
## g 126   0   0   0   0   0 0 126
## t   0   0   0   0   0   0 0   0
## u   0   0   0   0   0   0 0   0
## r   0   0   0   0   0   0 0   0
## y   0   0   0   0   0   0 0   0
## m   0   0   0   0   0   0 0   0
## k   0   0   0   0   0   0 0   0
## w   0   0   0   0   0   0 0   0
## s   0   0   0   0   0   0 0   0
## b   0   0   0   0   0   0 0   0
## d   0   0   0   0   0   0 0   0
## h   0   0   0   0   0   0 0   0
## v   0   0   0   0   0   0 0   0
## n   0   0   0   0   0   0 0   0
## x   0   0   0   0   0   0 0   0
## -   0   0   0   0   0   0 0   0
## .   0   0   0   0   0   0 0   0
bf$base.freqs
## 
##      a      c      g      t      u      r      y      m      k      w 
## 0.2997 0.2282 0.1283 0.3389 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
##      s      b      d      h      v      n      x      -      . 
## 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0048 0.0000

One can also identify which sites are fixed and which are variable:

fs <- fixedSites(dolph.seqs)
fs[1:20]
##   1   2   3   4   5   6   8   9  10  11  12  13  14  15  16  17  18  19 
## "g" "a" "a" "a" "a" "a" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c" "a" 
##  21  22 
## "t" "t"
vs <- variableSites(dolph.seqs)
vs
## $sites
## 126 DNA sequences in binary format stored in a matrix.
## 
## All sequences of same length: 41 
## 
## Labels:
## 4495
## 4496
## 4498
## 5814
## 5815
## 5816
## ...
## 
## Base composition:
##     a     c     g     t 
## 0.206 0.360 0.088 0.347 
## 
## $site.freqs
##    20  32  57  92  97  99 101 104 106 109 149 150 151 205 245 248 265 269
## a   2   0   1 124   0   0   0 125 124   0   0 124   0   0   0   2   2   0
## c   0  10   0   0   7 115   6   0   0  12 112   0   2 102 114   0  31  99
## g 124   0 125   2   0   0   0   1   2   0   0   2   0   0   0 124   0   0
## t   0 116   0   0 119  11 120   0   0 114  14   0 124  24  12   0  93  27
## -   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   272 274 275 278 279 280 282 283 287 293 294 302 303 305 329 357 370 373
## a 123   1 123   0   0   0   0   0 125   0   0   0  78   0   0   0   0   0
## c   0 124   0  10  14   2  98  97   0  84  97 124   0   4 112 124  77   4
## g   3   0   3   0   0   0   0   0   1   0   0   0  48   0   0   0   0   0
## t   0   1   0 116 112 124  28  29   0  42  29   2   0 122  14   2  49 122
## -   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   390 391 392 393 394
## a 108   0   0   0   0
## c   0  27 125   1 125
## g  18   0   0   0   0
## t   0  99   1 125   1
## -   0   0   0   0   0

Both functions take an optional bases argument listing the bases to consider when evaluating whether a site is fixed or variable. For fixedSites, only those sites that are fixed for one of the listed bases are counted. For variableSites, a site is considered variable if it contains the listed bases and is not fixed for any one of them:

fs <- fixedSites(dolph.seqs, bases = c("c", "t"))
fs[1:20]
##   9  10  11  13  14  16  18  21  22  24  25  27  30  33  35  38  40  41 
## "c" "t" "t" "t" "t" "t" "c" "t" "t" "c" "c" "c" "c" "c" "c" "t" "c" "t" 
##  43  45 
## "c" "t"
vs <- variableSites(dolph.seqs, bases = c("c", "t"))
vs
## $sites
## 126 DNA sequences in binary format stored in a matrix.
## 
## All sequences of same length: 29 
## 
## Labels:
## 4495
## 4496
## 4498
## 5814
## 5815
## 5816
## ...
## 
## Base composition:
##     a     c     g     t 
## 0.001 0.509 0.000 0.490 
## 
## $site.freqs
##    32  97  99 101 109 149 151 205 245 265 269 274 278 279 280 282 283 293
## c  10   7 115   6  12 112   2 102 114  31  99 124  10  14   2  98  97  84
## t 116 119  11 120 114  14 124  24  12  93  27   1 116 112 124  28  29  42
##   294 302 305 329 357 370 373 391 392 393 394
## c  97 124   4 112 124  77   4  27 125   1 125
## t  29   2 122  14   2  49 122  99   1 125   1

There are also functions to compare bases against IUPAC ambiguity codes. One can calculate the appropriate IUPAC code for a vector of nucleotides:

iupacCode(c("c", "t", "t", "c", "c"))
## [1] "y"
iupacCode(c("c", "t", "a", "c", "c"))
## [1] "h"
iupacCode(c("g", "t", "a", "c", "c"))
## [1] "n"

One can also calculate all IUPAC codes that apply to a vector of nucleotides:

validIupacCodes(c("c", "t", "t", "c", "c"))
## [1] "y" "b" "h" "n" "x" "-" "."
validIupacCodes(c("c", "t", "a", "c", "c"))
## [1] "h" "n" "x" "-" "."
validIupacCodes(c("g", "t", "a", "c", "c"))
## [1] "n" "x" "-" "."

A consensus sequence can also be easily generated:

createConsensus(dolph.seqs)
##   [1] "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a"
##  [18] "c" "a" "r" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c" "a" "y" "c" "a"
##  [35] "c" "a" "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g" "t" "a" "t"
##  [52] "t" "a" "a" "a" "a" "r" "t" "a" "a" "t" "t" "t" "g" "t" "t" "t" "t"
##  [69] "a" "a" "a" "a" "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t" "a"
##  [86] "c" "a" "c" "a" "t" "t" "r" "c" "a" "t" "a" "y" "a" "y" "a" "y" "a"
## [103] "c" "r" "c" "r" "t" "g" "y" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t"
## [120] "a" "t" "t" "t" "a" "g" "t" "c" "-" "t" "c" "t" "c" "c" "t" "t" "g"
## [137] "t" "a" "a" "a" "t" "a" "t" "t" "c" "a" "t" "a" "y" "r" "y" "a" "c"
## [154] "a" "t" "g" "c" "t" "a" "t" "g" "t" "a" "t" "t" "a" "t" "t" "g" "t"
## [171] "g" "c" "a" "t" "t" "c" "a" "t" "t" "t" "a" "t" "t" "t" "t" "c" "c"
## [188] "a" "t" "a" "c" "g" "a" "t" "a" "a" "g" "t" "t" "a" "a" "a" "g" "c"
## [205] "y" "c" "g" "t" "a" "t" "t" "a" "a" "t" "t" "a" "t" "c" "a" "t" "t"
## [222] "a" "a" "t" "t" "t" "t" "a" "c" "a" "t" "a" "t" "t" "a" "c" "a" "t"
## [239] "a" "a" "t" "a" "t" "g" "y" "a" "t" "r" "c" "t" "c" "t" "t" "a" "c"
## [256] "a" "t" "a" "t" "t" "a" "t" "a" "t" "h" "t" "c" "c" "y" "c" "t" "r"
## [273] "t" "h" "r" "a" "t" "y" "y" "y" "a" "y" "y" "t" "c" "c" "r" "t" "t"
## [290] "a" "t" "a" "y" "y" "c" "t" "a" "t" "g" "g" "t" "y" "r" "c" "y" "c"
## [307] "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g" "a" "g" "c" "t" "t"
## [324] "a" "a" "t" "c" "a" "y" "c" "a" "t" "g" "c" "c" "g" "c" "g" "t" "g"
## [341] "a" "a" "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g" "c" "t" "y"
## [358] "g" "g" "c" "a" "g" "g" "g" "a" "t" "c" "c" "c" "y" "c" "t" "y" "c"
## [375] "t" "c" "g" "c" "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "r" "y"
## [392] "y" "y" "y" "g" "t" "g" "g" "g" "g" "g" "t"

Nucleotide diversity for each site is calculated with:

nucleotideDiversity(dolph.seqs)
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##    13    14    15    16    17    18    19    20    21    22    23    24 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.031 0.000 0.000 0.000 0.000 
##    25    26    27    28    29    30    31    32    33    34    35    36 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.147 0.000 0.000 0.000 0.000 
##    37    38    39    40    41    42    43    44    45    46    47    48 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##    49    50    51    52    53    54    55    56    57    58    59    60 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.016 0.000 0.000 0.000 
##    61    62    63    64    65    66    67    68    69    70    71    72 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##    73    74    75    76    77    78    79    80    81    82    83    84 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##    85    86    87    88    89    90    91    92    93    94    95    96 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.031 0.000 0.000 0.000 0.000 
##    97    98    99   100   101   102   103   104   105   106   107   108 
## 0.106 0.000 0.161 0.000 0.091 0.000 0.000 0.016 0.000 0.031 0.000 0.000 
##   109   110   111   112   113   114   115   116   117   118   119   120 
## 0.174 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   121   122   123   124   125   126   127   128   129   130   131   132 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   133   134   135   136   137   138   139   140   141   142   143   144 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   145   146   147   148   149   150   151   152   153   154   155   156 
## 0.000 0.000 0.000 0.000 0.199 0.031 0.031 0.000 0.000 0.000 0.000 0.000 
##   157   158   159   160   161   162   163   164   165   166   167   168 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   169   170   171   172   173   174   175   176   177   178   179   180 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   181   182   183   184   185   186   187   188   189   190   191   192 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   193   194   195   196   197   198   199   200   201   202   203   204 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   205   206   207   208   209   210   211   212   213   214   215   216 
## 0.311 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   217   218   219   220   221   222   223   224   225   226   227   228 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   229   230   231   232   233   234   235   236   237   238   239   240 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   241   242   243   244   245   246   247   248   249   250   251   252 
## 0.000 0.000 0.000 0.000 0.174 0.000 0.000 0.031 0.000 0.000 0.000 0.000 
##   253   254   255   256   257   258   259   260   261   262   263   264 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   265   266   267   268   269   270   271   272   273   274   275   276 
## 0.398 0.000 0.000 0.000 0.339 0.000 0.000 0.047 0.000 0.032 0.047 0.000 
##   277   278   279   280   281   282   283   284   285   286   287   288 
## 0.000 0.147 0.199 0.031 0.000 0.348 0.357 0.000 0.000 0.000 0.016 0.000 
##   289   290   291   292   293   294   295   296   297   298   299   300 
## 0.000 0.000 0.000 0.000 0.448 0.357 0.000 0.000 0.000 0.000 0.000 0.000 
##   301   302   303   304   305   306   307   308   309   310   311   312 
## 0.000 0.031 0.475 0.000 0.062 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   313   314   315   316   317   318   319   320   321   322   323   324 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   325   326   327   328   329   330   331   332   333   334   335   336 
## 0.000 0.000 0.000 0.000 0.199 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   337   338   339   340   341   342   343   344   345   346   347   348 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   349   350   351   352   353   354   355   356   357   358   359   360 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.031 0.000 0.000 0.000 
##   361   362   363   364   365   366   367   368   369   370   371   372 
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.479 0.000 0.000 
##   373   374   375   376   377   378   379   380   381   382   383   384 
## 0.062 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
##   385   386   387   388   389   390   391   392   393   394   395   396 
## 0.000 0.000 0.000 0.000 0.000 0.247 0.339 0.016 0.016 0.016 0.000 0.000 
##   397   398   399   400   401   402 
## 0.000 0.000 0.000 0.000 0.000 0.000

For a stratified gtypes object, one can calculate net nucleotide divergence (Nei’s dA), and distributions of between- and within-strata divergence:

# create gtypes
data(dolph.seqs)
data(dolph.strata)
dloop <- df2gtypes(dolph.strata[, c("id", "fine", "id")], ploidy = 1,
             schemes = dolph.strata[, c("fine", "broad")], sequences = dolph.seqs)
dloop <- labelHaplotypes(dloop, "Hap.")$gtypes

# calculate divergence
nucleotideDivergence(dloop)
## $id.1
## $id.1$within
##                  mean pct.0 pct.0.025 pct.0.5 pct.0.975 pct.1
## Coastal        0.0051     0         0   0.005     0.010 0.010
## Offshore.North 0.0227     0         0   0.022     0.037 0.050
## Offshore.South 0.0187     0         0   0.018     0.038 0.043
## 
## $id.1$between
##         strata.1       strata.2     dA  mean  pct.0 pct.0.025 pct.0.5
## 1        Coastal Offshore.North 0.0061 0.020 0.0000    0.0050   0.020
## 2        Coastal Offshore.South 0.0062 0.018 0.0075    0.0075   0.018
## 3 Offshore.North Offshore.South 0.0011 0.022 0.0000    0.0000   0.022
##   pct.0.975 pct.1
## 1     0.033 0.037
## 2     0.033 0.037
## 3     0.040 0.048

For stratified gtypes, one can also identify fixed differences between strata:

fixedDifferences(dloop)
## $sites
## $sites$`Coastal v. Offshore.North`
##               
## Coastal       
## Offshore.North
## 
## $sites$`Coastal v. Offshore.South`
##               
## Coastal       
## Offshore.South
## 
## $sites$`Offshore.North v. Offshore.South`
##               
## Offshore.North
## Offshore.South
## 
## 
## $num.fixed
##         strata.1       strata.2 num.fixed
## 1        Coastal Offshore.North         0
## 2        Coastal Offshore.South         0
## 3 Offshore.North Offshore.South         0

Two functions have been provided to select a subset of representative sequences. The first selects the most distant sequences in order to capture the full distribution of variation. For example:

x <- as.DNAbin(dolph.seqs)
mostDistantSequences(x, num.seqs = 5)
## [1] "74962" "6290"  "74963" "18652" "50746"

The other function selects the most representative sequences by first clustering the sequences and then selecting the sequences closest to the center of each cluster:

mostRepresentativeSequences(x, num.seqs = 5)
## [1] "6153"  "18655" "26305" "78044" "78055"