You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

230 lines
5.0 KiB

3 years ago
3 years ago
3 years ago
  1. from collections import defaultdict
  2. from typing import Dict, List, Type
  3. from pydantic import BaseModel
  4. from typing_extensions import Literal
  5. from ...models import DOCUMENT_CLASSIFICATION, SEQ2SEQ, SEQUENCE_LABELING
  6. from . import examples
  7. encodings = Literal[
  8. 'Auto',
  9. 'ascii',
  10. 'big5',
  11. 'big5hkscs',
  12. 'cp037',
  13. 'cp273',
  14. 'cp424',
  15. 'cp437',
  16. 'cp500',
  17. 'cp720',
  18. 'cp737',
  19. 'cp775',
  20. 'cp850',
  21. 'cp852',
  22. 'cp855',
  23. 'cp856',
  24. 'cp857',
  25. 'cp858',
  26. 'cp860',
  27. 'cp861',
  28. 'cp862',
  29. 'cp863',
  30. 'cp864',
  31. 'cp865',
  32. 'cp866',
  33. 'cp869',
  34. 'cp874',
  35. 'cp875',
  36. 'cp932',
  37. 'cp949',
  38. 'cp950',
  39. 'cp1006',
  40. 'cp1026',
  41. 'cp1125',
  42. 'cp1140',
  43. 'cp1250',
  44. 'cp1251',
  45. 'cp1252',
  46. 'cp1253',
  47. 'cp1254',
  48. 'cp1255',
  49. 'cp1256',
  50. 'cp1257',
  51. 'cp1258',
  52. 'cp65001',
  53. 'euc_jp',
  54. 'euc_jis_2004',
  55. 'euc_jisx0213',
  56. 'euc_kr',
  57. 'gb2312',
  58. 'gbk',
  59. 'gb18030',
  60. 'hz',
  61. 'iso2022_jp',
  62. 'iso2022_jp_1',
  63. 'iso2022_jp_2',
  64. 'iso2022_jp_2004',
  65. 'iso2022_jp_3',
  66. 'iso2022_jp_ext',
  67. 'iso2022_kr',
  68. 'latin_1',
  69. 'iso8859_2',
  70. 'iso8859_3',
  71. 'iso8859_4',
  72. 'iso8859_5',
  73. 'iso8859_6',
  74. 'iso8859_7',
  75. 'iso8859_8',
  76. 'iso8859_9',
  77. 'iso8859_10',
  78. 'iso8859_11',
  79. 'iso8859_13',
  80. 'iso8859_14',
  81. 'iso8859_15',
  82. 'iso8859_16',
  83. 'johab',
  84. 'koi8_r',
  85. 'koi8_t',
  86. 'koi8_u',
  87. 'kz1048',
  88. 'mac_cyrillic',
  89. 'mac_greek',
  90. 'mac_iceland',
  91. 'mac_latin2',
  92. 'mac_roman',
  93. 'mac_turkish',
  94. 'ptcp154',
  95. 'shift_jis',
  96. 'shift_jis_2004',
  97. 'shift_jisx0213',
  98. 'utf_32',
  99. 'utf_32_be',
  100. 'utf_32_le',
  101. 'utf_16',
  102. 'utf_16_be',
  103. 'utf_16_le',
  104. 'utf_7',
  105. 'utf_8',
  106. 'utf_8_sig'
  107. ]
  108. class Format:
  109. name = ''
  110. accept_types = ''
  111. @classmethod
  112. def dict(cls):
  113. return {
  114. 'name': cls.name,
  115. 'accept_types': cls.accept_types
  116. }
  117. class CSV(Format):
  118. name = 'CSV'
  119. accept_types = 'text/csv'
  120. class FastText(Format):
  121. name = 'fastText'
  122. accept_types = 'text/plain'
  123. class JSON(Format):
  124. name = 'JSON'
  125. accept_types = 'application/json'
  126. class JSONL(Format):
  127. name = 'JSONL'
  128. accept_types = '*'
  129. class Excel(Format):
  130. name = 'Excel'
  131. accept_types = 'application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  132. class TextFile(Format):
  133. name = 'TextFile'
  134. accept_types = 'text/*'
  135. class TextLine(Format):
  136. name = 'TextLine'
  137. accept_types = 'text/*'
  138. class CoNLL(Format):
  139. name = 'CoNLL'
  140. accept_types = 'text/*'
  141. class OptionColumn(BaseModel):
  142. encoding: encodings = 'utf_8'
  143. column_data: str = 'text'
  144. column_label: str = 'label'
  145. class OptionDelimiter(OptionColumn):
  146. encoding: encodings = 'utf_8'
  147. delimiter: Literal[',', '\t', ';', '|', ' '] = ','
  148. class OptionNone(BaseModel):
  149. encoding: encodings = 'utf_8'
  150. class OptionCoNLL(BaseModel):
  151. encoding: encodings = 'utf_8'
  152. scheme: Literal['IOB2', 'IOE2', 'IOBES', 'BILOU'] = 'IOB2'
  153. delimiter: Literal[' ', ''] = ' '
  154. class Options:
  155. options: Dict[str, List] = defaultdict(list)
  156. @classmethod
  157. def filter_by_task(cls, task_name: str):
  158. options = cls.options[task_name]
  159. return [
  160. {
  161. **format.dict(),
  162. **option.schema(),
  163. 'example': example
  164. } for format, option, example in options
  165. ]
  166. @classmethod
  167. def register(cls,
  168. task: str,
  169. format: Type[Format],
  170. option: Type[BaseModel],
  171. example: str):
  172. cls.options[task].append((format, option, example))
  173. # Text Classification
  174. Options.register(DOCUMENT_CLASSIFICATION, TextFile, OptionNone, examples.Generic_TextFile)
  175. Options.register(DOCUMENT_CLASSIFICATION, TextLine, OptionNone, examples.Generic_TextLine)
  176. Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
  177. Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionNone, examples.Category_fastText)
  178. Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionColumn, examples.Category_JSON)
  179. Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionColumn, examples.Category_JSONL)
  180. Options.register(DOCUMENT_CLASSIFICATION, Excel, OptionColumn, examples.Category_CSV)
  181. # Sequence Labeling
  182. Options.register(SEQUENCE_LABELING, TextFile, OptionNone, examples.Generic_TextFile)
  183. Options.register(SEQUENCE_LABELING, TextLine, OptionNone, examples.Generic_TextLine)
  184. Options.register(SEQUENCE_LABELING, JSONL, OptionColumn, examples.Offset_JSONL)
  185. Options.register(SEQUENCE_LABELING, CoNLL, OptionCoNLL, examples.Offset_CoNLL)
  186. # Sequence to sequence
  187. Options.register(SEQ2SEQ, TextFile, OptionNone, examples.Generic_TextFile)
  188. Options.register(SEQ2SEQ, TextLine, OptionNone, examples.Generic_TextLine)
  189. Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
  190. Options.register(SEQ2SEQ, JSON, OptionColumn, examples.Text_JSON)
  191. Options.register(SEQ2SEQ, JSONL, OptionColumn, examples.Text_JSONL)
  192. Options.register(SEQ2SEQ, Excel, OptionColumn, examples.Text_CSV)