as_tokenizer.c 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include "as_tokenizer.h"
  5. #include "as_op.h"
  6. #include "utils.h"
  7. int InputStream_nextChar(InputStream s) {
  8. if (s->cursor == -1) {
  9. return EOF;
  10. }
  11. if (s->buf_pos == s->cursor) {
  12. size_t n = fread(s->buf, 1, INPUT_STREAM_BUF_SIZE, s->fp);
  13. if (n == 0) {
  14. s->cursor = -1;
  15. return EOF;
  16. }
  17. s->buf_pos = n;
  18. s->cursor = 0;
  19. }
  20. int c = s->buf[s->cursor];
  21. s->cursor++;
  22. if (c == '\n') {
  23. s->line++;
  24. s->col = 1;
  25. } else {
  26. s->col++;
  27. }
  28. return c;
  29. }
  30. int InputStream_peekChar(InputStream s) {
  31. if (s->cursor == -1) {
  32. return EOF;
  33. }
  34. if (s->buf_pos == s->cursor) {
  35. size_t n = fread(s->buf, 1, INPUT_STREAM_BUF_SIZE, s->fp);
  36. if (n == 0) {
  37. return EOF;
  38. }
  39. s->buf_pos = n;
  40. s->cursor = 0;
  41. }
  42. return s->buf[s->cursor];
  43. }
  44. int isStartOfIndentifier(int c) {
  45. if (c >= 'a' && c <= 'z') {
  46. return 1;
  47. }
  48. if (c >= 'A' && c <= 'Z') {
  49. return 1;
  50. }
  51. if (c == '_') {
  52. return 1;
  53. }
  54. return 0;
  55. }
  56. int isPartOfIndentifier(int c) {
  57. if (isStartOfIndentifier(c)) {
  58. return 1;
  59. }
  60. if (c >= '0' && c <= '9') {
  61. return 1;
  62. }
  63. return 0;
  64. }
  65. Token nextTokenImpl(Allocator alct, InputStream s) {
  66. Token t = allocate(alct, sizeof(struct token));
  67. int c;
  68. while (1) {
  69. c = InputStream_peekChar(s);
  70. if (c == EOF) {
  71. break;
  72. }
  73. if (c == '\n') {
  74. InputStream_nextChar(s);
  75. *t = (struct token){.type = NEWLINE, .line = s->line, .col = s->col};
  76. return t;
  77. }
  78. if (c == ':') {
  79. InputStream_nextChar(s);
  80. *t = (struct token){.type = COLON, .line = s->line, .col = s->col};
  81. return t;
  82. }
  83. if (c == ' ' || c == '\t') {
  84. InputStream_nextChar(s);
  85. continue;
  86. }
  87. if (c >= '0' && c <= '9') {
  88. int64_t ival = 0;
  89. while (1) {
  90. InputStream_nextChar(s);
  91. ival = ival * 10 + (c - '0');
  92. c = InputStream_peekChar(s);
  93. if (c < '0' || c > '9') {
  94. break;
  95. }
  96. }
  97. *t = (struct token){.type = ARG, .ival = ival, .line = s->line, .col = s->col};
  98. return t;
  99. }
  100. if (isStartOfIndentifier(c)) {
  101. size_t line = s->line;
  102. size_t col = s->col;
  103. char *sval = allocate(alct, 256);
  104. size_t i = 0;
  105. while(1) {
  106. if (i >= 255) {
  107. fprintf(stderr, "error: identifier too long\n");
  108. exit(1);
  109. }
  110. InputStream_nextChar(s);
  111. sval[i++] = c;
  112. c = InputStream_peekChar(s);
  113. if (!isPartOfIndentifier(c)) {
  114. break;
  115. }
  116. }
  117. sval[i] = '\0';
  118. if (isOp(sval)) {
  119. *t = (struct token){.type = OP, .sval = sval, .line = line, .col = col};
  120. return t;
  121. }
  122. *t = (struct token){.type = TAG, .sval = sval, .line = line, .col = col};
  123. return t;
  124. }
  125. fprintf(stderr, "error: invalid character %c at line %d, col %d\n", c, s->line, s->col);
  126. }
  127. // end of file
  128. *t = (struct token){.type = ENDOFFILE};
  129. return t;
  130. }
  131. Token nextToken(Allocator alct, TokenStream ts) {
  132. if (ts->buf != NULL) {
  133. Token t = ts->buf;
  134. ts->buf = NULL;
  135. return t;
  136. }
  137. Token t = nextTokenImpl(alct, ts->s);
  138. return t;
  139. }
  140. Token peekToken(Allocator alct, TokenStream ts) {
  141. if (ts->buf != NULL) {
  142. return ts->buf;
  143. }
  144. ts->buf = nextTokenImpl(alct, ts->s);
  145. return ts->buf;
  146. }
  147. void printToken(Token t) {
  148. switch (t->type) {
  149. case OP:
  150. printf("OP: %s, line: %d, col: %d\n", t->sval, t->line, t->col);
  151. break;
  152. case ARG:
  153. printf("ARG: %ld, line: %d, col: %d\n", t->ival, t->line, t->col);
  154. break;
  155. case TAG:
  156. printf("LABEL: %s, line: %d, col: %d\n", t->sval, t->line, t->col);
  157. break;
  158. case COLON:
  159. printf("COLON\n");
  160. break;
  161. case NEWLINE:
  162. printf("NEWLINE\n");
  163. break;
  164. case ENDOFFILE:
  165. printf("ENDOFFILE\n");
  166. break;
  167. }
  168. }
  169. TokenStream makeTokenStream(Allocator alct, FILE* fp) {
  170. InputStream s = allocate(alct, sizeof(struct inputStream));
  171. s->fp = fp;
  172. s->buf = allocate(alct, INPUT_STREAM_BUF_SIZE);
  173. s->buf_pos = 0;
  174. s->cursor = 0;
  175. s->line = 1;
  176. s->col = 1;
  177. TokenStream ts = allocate(alct, sizeof(struct tokenStream));
  178. ts->s = s;
  179. ts->buf = NULL;
  180. return ts;
  181. }