1  """CSS Selectors based on XPath. 
  2   
  3  This module supports selecting XML/HTML tags based on CSS selectors. 
  4  See the `CSSSelector` class for details. 
  5  """ 
  6   
  7  import re 
  8  from lxml import etree 
  9   
 10  __all__ = ['SelectorSyntaxError', 'ExpressionError', 
 11             'CSSSelector'] 
 12   
 15   
 18   
 20      """A CSS selector. 
 21   
 22      Usage:: 
 23   
 24          >>> from lxml import etree, cssselect 
 25          >>> select = cssselect.CSSSelector("a tag > child") 
 26   
 27          >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>") 
 28          >>> [ el.tag for el in select(root) ] 
 29          ['child'] 
 30      """ 
 35   
 37          return '<%s %s for %r>' % ( 
 38              self.__class__.__name__, 
 39              hex(abs(id(self)))[2:], 
 40              self.css) 
   41   
 42   
 43   
 44   
 47          obj = unicode.__new__(cls, contents) 
 48          obj.pos = pos 
 49          return obj 
  50           
 52          return '%s(%s, %r)' % ( 
 53              self.__class__.__name__, 
 54              unicode.__repr__(self), 
 55              self.pos) 
  74      """ 
 75      Represents selector.class_name 
 76      """ 
 77   
 78 -    def __init__(self, selector, class_name): 
  79          self.selector = selector 
 80          self.class_name = class_name 
  81   
 83          return '%s[%r.%s]' % ( 
 84              self.__class__.__name__, 
 85              self.selector, 
 86              self.class_name) 
  87   
 89          sel_xpath = self.selector.xpath() 
 90          sel_xpath.add_condition( 
 91              "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' ')) 
 92          return sel_xpath 
   93   
 95      """ 
 96      Represents selector:name(expr) 
 97      """ 
 98   
 99      unsupported = [ 
100          'target', 'lang', 'enabled', 'disabled',] 
101   
102 -    def __init__(self, selector, type, name, expr): 
 103          self.selector = selector 
104          self.type = type 
105          self.name = name 
106          self.expr = expr 
 107   
109          return '%s[%r%s%s(%r)]' % ( 
110              self.__class__.__name__, 
111              self.selector, 
112              self.type, self.name, self.expr) 
 113   
115          sel_path = self.selector.xpath() 
116          if self.name in self.unsupported: 
117              raise ExpressionError( 
118                  "The psuedo-class %r is not supported" % self.name) 
119          method = '_xpath_' + self.name.replace('-', '_') 
120          if not hasattr(self, method): 
121              raise ExpressionError( 
122                  "The psuedo-class %r is unknown" % self.name) 
123          method = getattr(self, method) 
124          return method(sel_path, self.expr) 
 125   
128          a, b = parse_series(expr) 
129          if not a and not b and not last: 
130               
131              xpath.add_condition('false() and position() = 0') 
132              return xpath 
133          if add_name_test: 
134              xpath.add_name_test() 
135          xpath.add_star_prefix() 
136          if a == 0: 
137              if last: 
138                  b = 'last() - %s' % b 
139              xpath.add_condition('position() = %s' % b) 
140              return xpath 
141          if last: 
142               
143              a = -a 
144              b = -b 
145          if b > 0: 
146              b_neg = str(-b) 
147          else: 
148              b_neg = '+%s' % (-b) 
149          if a != 1: 
150              expr = ['(position() %s) mod %s = 0' % (b_neg, a)] 
151          else: 
152              expr = [] 
153          if b >= 0: 
154              expr.append('position() >= %s' % b) 
155          elif b < 0 and last: 
156              expr.append('position() < (last() %s)' % b) 
157          expr = ' and '.join(expr) 
158          if expr: 
159              xpath.add_condition(expr) 
160          return xpath 
 161           
162           
163           
164           
165           
166           
167           
168   
170          return self._xpath_nth_child(xpath, expr, last=True) 
 171   
173          if xpath.element == '*': 
174              raise NotImplementedError( 
175                  "*:nth-of-type() is not implemented") 
176          return self._xpath_nth_child(xpath, expr, add_name_test=False) 
 177   
179          return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False) 
 180   
182           
183          if isinstance(expr, Element): 
184              expr = expr._format_element() 
185          xpath.add_condition('contains(css:lower-case(string(.)), %s)' 
186                              % xpath_repr(expr.lower())) 
187           
188          return xpath 
 189   
191           
192          expr = expr.xpath() 
193          cond = expr.condition 
194           
195          xpath.add_condition('not(%s)' % cond) 
196          return xpath 
  197   
200   
# Register an lxml XPath extension-function namespace under the ``css``
# prefix.  Generated XPath conditions call ``css:lower-case(...)`` (see the
# ``contains(css:lower-case(string(.)), ...)`` condition built in this
# module); that name is mapped here to ``_make_lower_case``.
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case
204   
206      """ 
207      Represents selector:ident 
208      """ 
209   
210      unsupported = ['indeterminate', 'first-line', 'first-letter', 
211                     'selection', 'before', 'after', 'link', 'visited', 
212                     'active', 'focus', 'hover'] 
213   
214 -    def __init__(self, element, type, ident): 
 215          self.element = element 
216          assert type in (':', '::') 
217          self.type = type 
218          self.ident = ident 
 219   
221          return '%s[%r%s%s]' % ( 
222              self.__class__.__name__, 
223              self.element, 
224              self.type, self.ident) 
 225   
227          el_xpath = self.element.xpath() 
228          if self.ident in self.unsupported: 
229              raise ExpressionError( 
230                  "The psuedo-class %r is unsupported" % self.ident) 
231          method = '_xpath_' + self.ident.replace('-', '_') 
232          if not hasattr(self, method): 
233              raise ExpressionError( 
234                  "The psuedo-class %r is unknown" % self.ident) 
235          method = getattr(self, method) 
236          el_xpath = method(el_xpath) 
237          return el_xpath 
 238   
240           
241          xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')") 
242          return xpath 
 243   
245           
246          raise NotImplementedError 
 247   
249          xpath.add_star_prefix() 
250          xpath.add_name_test() 
251          xpath.add_condition('position() = 1') 
252          return xpath 
 253   
255          xpath.add_star_prefix() 
256          xpath.add_name_test() 
257          xpath.add_condition('position() = last()') 
258          return xpath 
 259   
261          if xpath.element == '*': 
262              raise NotImplementedError( 
263                  "*:first-of-type is not implemented") 
264          xpath.add_star_prefix() 
265          xpath.add_condition('position() = 1') 
266          return xpath 
 267   
269          if xpath.element == '*': 
270              raise NotImplementedError( 
271                  "*:last-of-type is not implemented") 
272          xpath.add_star_prefix() 
273          xpath.add_condition('position() = last()') 
274          return xpath 
 275   
277          xpath.add_name_test() 
278          xpath.add_star_prefix() 
279          xpath.add_condition('last() = 1') 
280          return xpath 
 281   
283          if xpath.element == '*': 
284              raise NotImplementedError( 
285                  "*:only-of-type is not implemented") 
286          xpath.add_condition('last() = 1') 
287          return xpath 
 288   
290          xpath.add_condition("not(*) and not(normalize-space())") 
291          return xpath 
  292   
294      """ 
295      Represents selector[namespace|attrib operator value] 
296      """ 
297   
298 -    def __init__(self, selector, namespace, attrib, operator, value): 
 299          self.selector = selector 
300          self.namespace = namespace 
301          self.attrib = attrib 
302          self.operator = operator 
303          self.value = value 
 304   
306          if self.operator == 'exists': 
307              return '%s[%r[%s]]' % ( 
308                  self.__class__.__name__, 
309                  self.selector, 
310                  self._format_attrib()) 
311          else: 
312              return '%s[%r[%s %s %r]]' % ( 
313                  self.__class__.__name__, 
314                  self.selector, 
315                  self._format_attrib(), 
316                  self.operator, 
317                  self.value) 
 318   
324   
326           
327          if self.namespace == '*': 
328              return '@' + self.attrib 
329          else: 
330              return '@%s:%s' % (self.namespace, self.attrib) 
 331   
333          path = self.selector.xpath() 
334          attrib = self._xpath_attrib() 
335          value = self.value 
336          if self.operator == 'exists': 
337              assert not value 
338              path.add_condition(attrib) 
339          elif self.operator == '=': 
340              path.add_condition('%s = %s' % (attrib, 
341                                              xpath_repr(value))) 
342          elif self.operator == '!=': 
343               
344              if value: 
345                  path.add_condition('not(%s) or %s != %s' 
346                                     % (attrib, attrib, xpath_repr(value))) 
347              else: 
348                  path.add_condition('%s != %s' 
349                                     % (attrib, xpath_repr(value))) 
350               
351          elif self.operator == '~=': 
352              path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' '))) 
353          elif self.operator == '|=': 
354               
355              path.add_condition('%s = %s or starts-with(%s, %s)' % ( 
356                  attrib, xpath_repr(value), 
357                  attrib, xpath_repr(value + '-'))) 
358          elif self.operator == '^=': 
359              path.add_condition('starts-with(%s, %s)' % ( 
360                  attrib, xpath_repr(value))) 
361          elif self.operator == '$=': 
362               
363              path.add_condition('substring(%s, string-length(%s)-%s) = %s' 
364                                 % (attrib, attrib, len(value)-1, xpath_repr(value))) 
365          elif self.operator == '*=': 
366               
367              path.add_condition('contains(%s, %s)' % ( 
368                  attrib, xpath_repr(value))) 
369          else: 
370              assert 0, ("Unknown operator: %r" % self.operator) 
371          return path 
  372   
374      """ 
375      Represents namespace|element 
376      """ 
377   
378 -    def __init__(self, namespace, element): 
 379          self.namespace = namespace 
380          self.element = element 
 381   
383          return '%s[%s]' % ( 
384              self.__class__.__name__, 
385              self._format_element()) 
 386   
392   
394          if self.namespace == '*': 
395              el = self.element.lower() 
396          else: 
397               
398              el = '%s:%s' % (self.namespace, self.element) 
399          return XPathExpr(element=el) 
  400   
402      """ 
403      Represents selector#id 
404      """ 
405   
407          self.selector = selector 
408          self.id = id 
 409   
411          return '%s[%r#%s]' % ( 
412              self.__class__.__name__, 
413              self.selector, self.id) 
 414   
416          path = self.selector.xpath() 
417          path.add_condition('@id = %s' % xpath_repr(self.id)) 
418          return path 
  419   
421   
425          return '%s(%r)' % ( 
426              self.__class__.__name__, 
427              self.items)     
 428   
430          paths = [item.xpath() for item in self.items] 
431          return XPathExprOr(paths) 
  432   
434   
435      _method_mapping = { 
436          ' ': 'descendant', 
437          '>': 'child', 
438          '+': 'direct_adjacent', 
439          '~': 'indirect_adjacent', 
440          } 
441   
442 -    def __init__(self, selector, combinator, subselector): 
 443          assert selector is not None 
444          self.selector = selector 
445          self.combinator = combinator 
446          self.subselector = subselector 
 447   
449          if self.combinator == ' ': 
450              comb = '<followed>' 
451          else: 
452              comb = self.combinator 
453          return '%s[%r %s %r]' % ( 
454              self.__class__.__name__, 
455              self.selector, 
456              comb, 
457              self.subselector) 
 458   
460          if self.combinator not in self._method_mapping: 
461              raise ExpressionError( 
462                  "Unknown combinator: %r" % self.combinator) 
463          method = '_xpath_' + self._method_mapping[self.combinator] 
464          method = getattr(self, method) 
465          path = self.selector.xpath() 
466          return method(path, self.subselector) 
 467   
472       
477   
479           
480          xpath.join('/following-sibling::', sub.xpath()) 
481          xpath.add_name_test() 
482          xpath.add_condition('position() = 1') 
483          return xpath 
 484   
 489   
490   
491   
492   
# Fast-path patterns for the simplest selectors.  A string selector matching
# one of these is translated to XPath directly, without being parsed:
#   _el_re     a bare element name, e.g. "div"
#   _id_re     an optional element name followed by "#id", e.g. "div#main"
#   _class_re  an optional element name followed by ".class", e.g. "p.note"
# Each pattern tolerates trailing whitespace only.
_el_re = re.compile(r'^\w+\s*$')
_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
496   
498      if isinstance(css_expr, basestring): 
499          match = _el_re.search(css_expr) 
500          if match is not None: 
501              return '%s%s' % (prefix, match.group(0).strip()) 
502          match = _id_re.search(css_expr) 
503          if match is not None: 
504              return "%s%s[@id = '%s']" % ( 
505                  prefix, match.group(1) or '*', match.group(2)) 
506          match = _class_re.search(css_expr) 
507          if match is not None: 
508              return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % ( 
509                  prefix, match.group(1) or '*', match.group(2)) 
510          css_expr = parse(css_expr) 
511      expr = css_expr.xpath() 
512      assert expr is not None, ( 
513          "Got None for xpath expression from %s" % repr(css_expr)) 
514      if prefix: 
515          expr.add_prefix(prefix) 
516      return str(expr) 
 517   
519   
520 -    def __init__(self, prefix=None, path=None, element='*', condition=None, 
521                   star_prefix=False): 
 522          self.prefix = prefix 
523          self.path = path 
524          self.element = element 
525          self.condition = condition 
526          self.star_prefix = star_prefix 
 527   
529          path = '' 
530          if self.prefix is not None: 
531              path += str(self.prefix) 
532          if self.path is not None: 
533              path += str(self.path) 
534          path += str(self.element) 
535          if self.condition: 
536              path += '[%s]' % self.condition 
537          return path 
 538   
540          return '%s[%s]' % ( 
541              self.__class__.__name__, self) 
 542   
544          if self.condition: 
545              self.condition = '%s and (%s)' % (self.condition, condition) 
546          else: 
547              self.condition = condition 
 548   
550          if self.path is None: 
551              self.path = self.element 
552          else: 
553              self.path += self.element 
554          self.element = part 
 555   
557          if self.prefix: 
558              self.prefix = prefix + self.prefix 
559          else: 
560              self.prefix = prefix 
 561   
563          if self.element == '*': 
564               
565              return 
566          self.add_condition("name() = %s" % xpath_repr(self.element)) 
567          self.element = '*' 
 568   
570          """ 
571          Adds a /* prefix if there is no prefix.  This is when you need 
572          to keep context's constrained to a single parent. 
573          """ 
574          if self.path: 
575              self.path += '*/' 
576          else: 
577              self.path = '*/' 
578          self.star_prefix = True 
 579   
580 -    def join(self, combiner, other): 
 581          prefix = str(self) 
582          prefix += combiner 
583          path = (other.prefix or '') + (other.path or '') 
584           
585           
586          if other.star_prefix and path == '*/': 
587              path = '' 
588          self.prefix = prefix 
589          self.path = path 
590          self.element = other.element 
591          self.condition = other.condition 
  592   
594      """ 
595      Represents |'d expressions.  Note that unfortunately it isn't 
596      the union, it's the sum, so duplicate elements will appear. 
597      """ 
598   
599 -    def __init__(self, items, prefix=None): 
 600          for item in items: 
601              assert item is not None 
602          self.items = items 
603          self.prefix = prefix 
 604   
606          prefix = self.prefix or '' 
607          return ' | '.join([prefix + str(i) for i in self.items]) 
  608   
610       
611       
612       
613      if isinstance(s, Element): 
614           
615          s = s._format_element() 
616      return repr(str(s)) 
 617   
618   
619   
620   
622      stream = TokenStream(tokenize(string)) 
623      stream.source = string 
624      try: 
625          return parse_selector_group(stream) 
626      except SelectorSyntaxError, e: 
627          e.args = tuple(["%s at %s -> %s" % ( 
628              e, stream.used, list(stream))]) 
629          raise 
 630   
632      result = [] 
633      while 1: 
634          result.append(parse_selector(stream)) 
635          if stream.peek() == ',': 
636              stream.next() 
637          else: 
638              break 
639      if len(result) == 1: 
640          return result[0] 
641      else: 
642          return Or(result) 
 643   
645      result = parse_simple_selector(stream) 
646      while 1: 
647          peek = stream.peek() 
648          if peek == ',' or peek == ')' or peek is None: 
649              return result 
650          if stream.peek() in ('+', '>', '~'): 
651               
652              combinator = stream.next() 
653          else: 
654              combinator = ' ' 
655          next_selector = parse_simple_selector(stream) 
656          result = CombinedSelector(result, combinator, next_selector) 
657      return result 
 658   
660      peek = stream.peek() 
661      if peek != '*' and not isinstance(peek, Symbol): 
662          element = namespace = '*' 
663      else: 
664          next = stream.next() 
665          if next != '*' and not isinstance(next, Symbol): 
666              raise SelectorSyntaxError( 
667                  "Expected symbol, got %r" % next) 
668          if stream.peek() == '|': 
669              namespace = next 
670              stream.next() 
671              element = stream.next() 
672              if element != '*' and not isinstance(next, Symbol): 
673                  raise SelectorSyntaxError( 
674                      "Expected symbol, got %r" % next) 
675          else: 
676              namespace = '*' 
677              element = next 
678      result = Element(namespace, element) 
679      has_hash = False 
680      while 1: 
681          peek = stream.peek() 
682          if peek == '#': 
683              if has_hash: 
684                   
685                   
686                  break 
687              stream.next() 
688              result = Hash(result, stream.next()) 
689              has_hash = True 
690              continue 
691          elif peek == '.': 
692              stream.next() 
693              result = Class(result, stream.next()) 
694              continue 
695          elif peek == '[': 
696              stream.next() 
697              result = parse_attrib(result, stream) 
698              next = stream.next() 
699              if not next == ']': 
700                  raise SelectorSyntaxError( 
701                      "] expected, got %r" % next) 
702              continue 
703          elif peek == ':' or peek == '::': 
704              type = stream.next() 
705              ident = stream.next() 
706              if not isinstance(ident, Symbol): 
707                  raise SelectorSyntaxError( 
708                      "Expected symbol, got %r" % ident) 
709              if stream.peek() == '(': 
710                  stream.next() 
711                  peek = stream.peek() 
712                  if isinstance(peek, String): 
713                      selector = stream.next() 
714                  elif isinstance(peek, Symbol) and is_int(peek): 
715                      selector = int(stream.next()) 
716                  else: 
717                       
718                      selector = parse_simple_selector(stream) 
719                      next = stream.next() 
720                      if not next == ')': 
721                          raise SelectorSyntaxError( 
722                              "Expected ), got %r and %r" 
723                              % (next, selector)) 
724                  result = Function(result, type, ident, selector) 
725              else: 
726                  result = Pseudo(result, type, ident) 
727              continue 
728          else: 
729              break 
730           
731      return result 
 732   
734      try: 
735          int(v) 
736      except ValueError: 
737          return False 
738      else: 
739          return True 
 740   
742      attrib = stream.next() 
743      if stream.peek() == '|': 
744          namespace = attrib 
745          stream.next() 
746          attrib = stream.next() 
747      else: 
748          namespace = '*' 
749      if stream.peek() == ']': 
750          return Attrib(selector, namespace, attrib, 'exists', None) 
751      op = stream.next() 
752      if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): 
753          raise SelectorSyntaxError( 
754              "Operator expected, got %r" % op) 
755      value = stream.next() 
756      if not isinstance(value, (Symbol, String)): 
757          raise SelectorSyntaxError( 
758              "Expected string or symbol, got %r" % value) 
759      return Attrib(selector, namespace, attrib, op, value) 
 760   
762      """ 
763      Parses things like '1n+2', or 'an+b' generally, returning (a, b) 
764      """ 
765      if isinstance(s, Element): 
766          s = s._format_element() 
767      if not s or s == '*': 
768           
769          return (0, 0) 
770      if isinstance(s, int): 
771           
772          return (0, s) 
773      if s == 'odd': 
774          return (2, 1) 
775      elif s == 'even': 
776          return (2, 0) 
777      elif s == 'n': 
778          return (1, 0) 
779      if 'n' not in s: 
780           
781          return (0, int(s)) 
782      a, b = s.split('n', 1) 
783      if not a: 
784          a = 1 
785      elif a == '-' or a == '+': 
786          a = int(a+'1') 
787      else: 
788          a = int(a) 
789      if not b: 
790          b = 0 
791      elif b == '-' or b == '+': 
792          b = int(b+'1') 
793      else: 
794          b = int(b) 
795      return (a, b) 
 796       
797   
798   
799   
800   
801   
# A run of whitespace; the tokenizer skips it between tokens.
_whitespace_re = re.compile(r'\s+')

# A /* ... */ comment (non-greedy; re.S lets it span several lines).  Comments
# are removed from the selector text before tokenizing starts.
_comment_re = re.compile(r'/\*.*?\*/', re.S)

# An "an+b" style series such as "2n+1", "-n+3" or "3n".  The tokenizer emits
# a match as one symbol, except when the match is just "n".
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
807   
809      pos = 0 
810      s = _comment_re.sub('', s) 
811      while 1: 
812          match = _whitespace_re.match(s, pos=pos) 
813          if match: 
814              pos = match.end() 
815          if pos >= len(s): 
816              return 
817          match = _count_re.match(s, pos=pos) 
818          if match and match.group() != 'n': 
819              sym = s[pos:match.end()] 
820              yield Symbol(sym, pos) 
821              pos = match.end() 
822              continue 
823          c = s[pos] 
824          c2 = s[pos:pos+2] 
825          if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): 
826              yield Token(c2, pos) 
827              pos += 2 
828              continue 
829          if c in '>+~,.*=[]()|:#': 
830              yield Token(c, pos) 
831              pos += 1 
832              continue 
833          if c == '"' or c == "'": 
834               
835              old_pos = pos 
836              sym, pos = tokenize_escaped_string(s, pos) 
837              yield String(sym, old_pos) 
838              continue 
839          old_pos = pos 
840          sym, pos = tokenize_symbol(s, pos) 
841          yield Symbol(sym, old_pos) 
842          continue 
 843   
845      quote = s[pos] 
846      assert quote in ('"', "'") 
847      pos = pos+1 
848      start = pos 
849      while 1: 
850          next = s.find(quote, pos) 
851          if next == -1: 
852              raise SelectorSyntaxError( 
853                  "Expected closing %s for string in: %r" 
854                  % (quote, s[start:])) 
855          result = s[start:next] 
856          try: 
857              result = result.decode('unicode_escape') 
858          except UnicodeDecodeError: 
859               
860              pos = next+1 
861          else: 
862              return result, next+1 
 863       
# Any character that cannot be part of a symbol: anything other than a word
# character (Unicode-aware), a backslash or a hyphen.  The first match marks
# where the symbol being read ends.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
865   
867      start = pos 
868      match = _illegal_symbol.search(s, pos=pos) 
869      if not match: 
870           
871          return s[start:], len(s) 
872      if match.start() == pos: 
873          assert 0, ( 
874              "Unexpected symbol: %r at %s" % (s[pos], pos)) 
875      if not match: 
876          result = s[start:] 
877          pos = len(s) 
878      else: 
879          result = s[start:match.start()] 
880          pos = match.start() 
881      try: 
882          result = result.decode('unicode_escape') 
883      except UnicodeDecodeError, e: 
884          raise SelectorSyntaxError( 
885              "Bad symbol %r: %s" % (result, e)) 
886      return result, pos 
 887   
889   
890 -    def __init__(self, tokens, source=None): 
 891          self.used = [] 
892          self.tokens = iter(tokens) 
893          self.source = source 
894          self.peeked = None 
895          self._peeking = False 
 896   
898          if self._peeking: 
899              self._peeking = False 
900              self.used.append(self.peeked) 
901              return self.peeked 
902          else: 
903              try: 
904                  next = self.tokens.next() 
905                  self.used.append(next) 
906                  return next 
907              except StopIteration: 
908                  return None 
 909   
911          return iter(self.next, None) 
 912   
914          if not self._peeking: 
915              try: 
916                  self.peeked = self.tokens.next() 
917              except StopIteration: 
918                  return None 
919              self._peeking = True 
920          return self.peeked 
  921